github.com/makyo/juju@v0.0.0-20160425123129-2608902037e9/worker/runner.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package worker
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/errors"
    10  	"github.com/juju/loggo"
    11  	"launchpad.net/tomb"
    12  )
    13  
// logger is the package-level logger, registered under the name
// "juju.worker".
var logger = loggo.GetLogger("juju.worker")
    15  
// RestartDelay holds the length of time that a worker
// will wait between exiting and restarting.
//
// NOTE(review): NewRunner takes its delay as an explicit argument;
// presumably callers pass this value as the default - confirm against
// the call sites.
const RestartDelay = 3 * time.Second
    19  
// Worker is implemented by a running worker. The two methods are
// typically used together: Kill requests shutdown and Wait collects
// the result.
type Worker interface {
	// Kill asks the worker to stop without necessarily
	// waiting for it to do so.
	Kill()
	// Wait waits for the worker to exit and returns any
	// error encountered when it was running.
	Wait() error
}
    29  
// Runner is implemented by instances capable of starting and stopping workers.
// A Runner is itself a Worker, so it can be killed and waited on like
// any other worker.
type Runner interface {
	Worker
	// StartWorker asks the runner to run a worker under the given id,
	// using startFunc to create it.
	StartWorker(id string, startFunc func() (Worker, error)) error
	// StopWorker asks the runner to stop the worker registered under
	// the given id.
	StopWorker(id string) error
}
    36  
// runner runs a set of workers, restarting them as necessary
// when they fail. All bookkeeping happens in the run loop; the
// channels below are how other goroutines talk to it.
type runner struct {
	// tomb tracks the lifetime of the run loop goroutine.
	tomb tomb.Tomb
	// startc carries requests from StartWorker to the run loop.
	startc chan startReq
	// stopc carries worker ids from StopWorker to the run loop.
	stopc chan string
	// donec is used by runWorker goroutines to report that they
	// have finished, along with the worker's error.
	donec chan doneInfo
	// startedc is used by runWorker goroutines to hand over a
	// worker that has been created successfully.
	startedc chan startInfo
	// isFatal reports whether a worker error should bring down
	// the whole runner.
	isFatal func(error) bool
	// moreImportant reports whether err0 should take precedence
	// over err1 as the runner's final error.
	moreImportant func(err0, err1 error) bool

	// restartDelay holds the length of time that a worker
	// will wait between exiting and restarting.
	restartDelay time.Duration
}
    52  
// startReq is the message sent on runner.startc by StartWorker.
type startReq struct {
	// id identifies the worker to start.
	id string
	// start creates the worker.
	start func() (Worker, error)
}
    57  
// startInfo is the message sent on runner.startedc by runWorker once
// a worker's start function has returned without error.
type startInfo struct {
	// id identifies the worker that has started.
	id string
	// worker is the value returned by the start function.
	worker Worker
}
    62  
// doneInfo is the message sent on runner.donec by runWorker when its
// goroutine is about to exit.
type doneInfo struct {
	// id identifies the worker that has finished.
	id string
	// err is the error from starting or waiting on the worker; it is
	// nil when the worker exited cleanly or the restart wait was
	// abandoned because the runner was dying.
	err error
}
    67  
    68  // NewRunner creates a new Runner.  When a worker finishes, if its error
    69  // is deemed fatal (determined by calling isFatal), all the other workers
    70  // will be stopped and the runner itself will finish.  Of all the fatal errors
    71  // returned by the stopped workers, only the most important one,
    72  // determined by calling moreImportant, will be returned from
    73  // Runner.Wait. Non-fatal errors will not be returned.
    74  //
    75  // The function isFatal(err) returns whether err is a fatal error.  The
    76  // function moreImportant(err0, err1) returns whether err0 is considered
    77  // more important than err1.
    78  func NewRunner(isFatal func(error) bool, moreImportant func(err0, err1 error) bool, restartDelay time.Duration) Runner {
    79  	runner := &runner{
    80  		startc:        make(chan startReq),
    81  		stopc:         make(chan string),
    82  		donec:         make(chan doneInfo),
    83  		startedc:      make(chan startInfo),
    84  		isFatal:       isFatal,
    85  		moreImportant: moreImportant,
    86  		restartDelay:  restartDelay,
    87  	}
    88  	go func() {
    89  		defer runner.tomb.Done()
    90  		runner.tomb.Kill(runner.run())
    91  	}()
    92  	return runner
    93  }
    94  
// ErrDead is returned by StartWorker and StopWorker when the runner
// is not running.
var ErrDead = errors.New("worker runner is not running")
    96  
    97  // StartWorker starts a worker running associated with the given id.
    98  // The startFunc function will be called to create the worker;
    99  // when the worker exits, it will be restarted as long as it
   100  // does not return a fatal error.
   101  //
   102  // If there is already a worker with the given id, nothing will be done.
   103  //
   104  // StartWorker returns ErrDead if the runner is not running.
   105  func (runner *runner) StartWorker(id string, startFunc func() (Worker, error)) error {
   106  	select {
   107  	case runner.startc <- startReq{id, startFunc}:
   108  		return nil
   109  	case <-runner.tomb.Dead():
   110  	}
   111  	return ErrDead
   112  }
   113  
   114  // StopWorker stops the worker associated with the given id.
   115  // It does nothing if there is no such worker.
   116  //
   117  // StopWorker returns ErrDead if the runner is not running.
   118  func (runner *runner) StopWorker(id string) error {
   119  	select {
   120  	case runner.stopc <- id:
   121  		return nil
   122  	case <-runner.tomb.Dead():
   123  	}
   124  	return ErrDead
   125  }
   126  
// Wait implements Worker. It delegates to the runner's tomb, which
// NewRunner kills with the result of the run loop.
func (runner *runner) Wait() error {
	return runner.tomb.Wait()
}
   130  
// Kill implements Worker. It kills the runner's tomb with a nil error;
// the run loop reacts to the tomb dying by killing all of its workers.
// Kill does not wait for them to stop.
func (runner *runner) Kill() {
	logger.Debugf("killing runner %p", runner)
	runner.tomb.Kill(nil)
}
   135  
   136  // Stop kills the given worker and waits for it to exit.
   137  func Stop(worker Worker) error {
   138  	worker.Kill()
   139  	return worker.Wait()
   140  }
   141  
// workerInfo holds the run loop's bookkeeping for a single worker id.
type workerInfo struct {
	// start creates the worker on (re)start. killWorker sets it to
	// nil, which tells the run loop not to restart the worker.
	start func() (Worker, error)
	// worker is the running worker as reported on startedc; it is
	// nil until then and is reset to nil by killWorker.
	worker Worker
	// restartDelay is the delay to apply to the next restart.
	restartDelay time.Duration
	// stopping is set by killWorker once the worker has been asked
	// to stop.
	stopping bool
}
   148  
   149  func (runner *runner) run() error {
   150  	// workers holds the current set of workers.  All workers with a
   151  	// running goroutine have an entry here.
   152  	workers := make(map[string]*workerInfo)
   153  	var finalError error
   154  
   155  	// isDying holds whether the runner is currently dying.  When it
   156  	// is dying (whether as a result of being killed or due to a
   157  	// fatal error), all existing workers are killed, no new workers
   158  	// will be started, and the loop will exit when all existing
   159  	// workers have stopped.
   160  	isDying := false
   161  	tombDying := runner.tomb.Dying()
   162  	for {
   163  		if isDying && len(workers) == 0 {
   164  			return finalError
   165  		}
   166  		select {
   167  		case <-tombDying:
   168  			logger.Infof("runner is dying")
   169  			isDying = true
   170  			killAll(workers)
   171  			tombDying = nil
   172  		case req := <-runner.startc:
   173  			if isDying {
   174  				logger.Infof("ignoring start request for %q when dying", req.id)
   175  				break
   176  			}
   177  			info := workers[req.id]
   178  			if info == nil {
   179  				workers[req.id] = &workerInfo{
   180  					start:        req.start,
   181  					restartDelay: runner.restartDelay,
   182  				}
   183  				go runner.runWorker(0, req.id, req.start)
   184  				break
   185  			}
   186  			if !info.stopping {
   187  				// The worker is already running, so leave it alone
   188  				break
   189  			}
   190  			// The worker previously existed and is
   191  			// currently being stopped.  When it eventually
   192  			// does stop, we'll restart it immediately with
   193  			// the new start function.
   194  			info.start = req.start
   195  			info.restartDelay = 0
   196  		case id := <-runner.stopc:
   197  			logger.Debugf("stop %q", id)
   198  			if info := workers[id]; info != nil {
   199  				killWorker(id, info)
   200  			}
   201  		case info := <-runner.startedc:
   202  			logger.Debugf("%q started", info.id)
   203  			workerInfo := workers[info.id]
   204  			workerInfo.worker = info.worker
   205  			if isDying || workerInfo.stopping {
   206  				killWorker(info.id, workerInfo)
   207  			}
   208  		case info := <-runner.donec:
   209  			logger.Debugf("%q done: %v", info.id, info.err)
   210  			workerInfo := workers[info.id]
   211  			if !workerInfo.stopping && info.err == nil {
   212  				logger.Debugf("removing %q from known workers", info.id)
   213  				delete(workers, info.id)
   214  				break
   215  			}
   216  			if info.err != nil {
   217  				if runner.isFatal(info.err) {
   218  					logger.Errorf("fatal %q: %v", info.id, info.err)
   219  					if finalError == nil || runner.moreImportant(info.err, finalError) {
   220  						finalError = info.err
   221  					}
   222  					delete(workers, info.id)
   223  					if !isDying {
   224  						isDying = true
   225  						killAll(workers)
   226  					}
   227  					break
   228  				} else {
   229  					logger.Errorf("exited %q: %v", info.id, info.err)
   230  				}
   231  			}
   232  			if workerInfo.start == nil {
   233  				logger.Debugf("no restart, removing %q from known workers", info.id)
   234  
   235  				// The worker has been deliberately stopped;
   236  				// we can now remove it from the list of workers.
   237  				delete(workers, info.id)
   238  				break
   239  			}
   240  			go runner.runWorker(workerInfo.restartDelay, info.id, workerInfo.start)
   241  			workerInfo.restartDelay = runner.restartDelay
   242  		}
   243  	}
   244  }
   245  
   246  func killAll(workers map[string]*workerInfo) {
   247  	for id, info := range workers {
   248  		killWorker(id, info)
   249  	}
   250  }
   251  
   252  func killWorker(id string, info *workerInfo) {
   253  	if info.worker != nil {
   254  		logger.Debugf("killing %q", id)
   255  		info.worker.Kill()
   256  		info.worker = nil
   257  	} else {
   258  		logger.Debugf("couldn't kill %q, not yet started", id)
   259  	}
   260  	info.stopping = true
   261  	info.start = nil
   262  }
   263  
   264  // runWorker starts the given worker after waiting for the given delay.
   265  func (runner *runner) runWorker(delay time.Duration, id string, start func() (Worker, error)) {
   266  	if delay > 0 {
   267  		logger.Infof("restarting %q in %v", id, delay)
   268  		select {
   269  		case <-runner.tomb.Dying():
   270  			runner.donec <- doneInfo{id, nil}
   271  			return
   272  		case <-time.After(delay):
   273  		}
   274  	}
   275  	logger.Infof("start %q", id)
   276  	worker, err := start()
   277  	if err == nil {
   278  		runner.startedc <- startInfo{id, worker}
   279  		err = worker.Wait()
   280  	}
   281  	logger.Infof("stopped %q, err: %v", id, err)
   282  	runner.donec <- doneInfo{id, err}
   283  }
   284  
// Workers is an order-preserving registry of worker factory functions.
type Workers struct {
	// ids lists the registered worker ids in the order they were added.
	ids []string
	// funcs maps each registered id to its factory function.
	funcs map[string]func() (Worker, error)
}
   290  
   291  // NewWorkers returns a new Workers.
   292  func NewWorkers() Workers {
   293  	return Workers{
   294  		funcs: make(map[string]func() (Worker, error)),
   295  	}
   296  }
   297  
   298  // IDs returns the list of registered worker IDs.
   299  func (r Workers) IDs() []string {
   300  	ids := make([]string, len(r.ids))
   301  	copy(ids, r.ids)
   302  	return ids
   303  }
   304  
   305  // Add registered the factory function for the identified worker.
   306  func (r *Workers) Add(id string, newWorker func() (Worker, error)) error {
   307  	if _, ok := r.funcs[id]; ok {
   308  		return errors.Errorf("worker %q already registered", id)
   309  	}
   310  	r.funcs[id] = newWorker
   311  	r.ids = append(r.ids, id)
   312  	return nil
   313  }
   314  
   315  // Start starts all the registered workers under the given runner.
   316  func (r *Workers) Start(runner Runner) error {
   317  	for _, id := range r.ids {
   318  		newWorker := r.funcs[id]
   319  		if err := runner.StartWorker(id, newWorker); err != nil {
   320  			return errors.Annotatef(err, "worker %q failed to start", id)
   321  		}
   322  	}
   323  	return nil
   324  }