github.com/axw/juju@v0.0.0-20161005053422-4bd6544d08d4/state/workers/restart.go (about)

     1  // Copyright 2016 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package workers
     5  
     6  import (
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/juju/errors"
    11  	"github.com/juju/loggo"
    12  	"github.com/juju/utils/clock"
    13  
    14  	"github.com/juju/juju/core/lease"
    15  	"github.com/juju/juju/worker"
    16  	"github.com/juju/juju/worker/catacomb"
    17  )
    18  
// RestartConfig holds a RestartWorkers' dependencies and configuration.
type RestartConfig struct {
	// Factory creates the sub-workers (txn log, presence, leadership,
	// singular) that the RestartWorkers manages and restarts.
	Factory Factory
	// Logger must be an initialized logger; the zero value is rejected
	// by Validate.
	Logger  loggo.Logger
	// Clock supplies the timer used to wait out Delay between a worker
	// failure and its replacement.
	Clock   clock.Clock
	// Delay is how long to wait before restarting a failed worker; it
	// must be positive.
	Delay   time.Duration
}
    26  
    27  // Validate returns an error if config cannot drive a RestartWorkers.
    28  func (config RestartConfig) Validate() error {
    29  	if config.Factory == nil {
    30  		return errors.NotValidf("nil Factory")
    31  	}
    32  	if config.Logger == (loggo.Logger{}) {
    33  		return errors.NotValidf("uninitialized Logger")
    34  	}
    35  	if config.Clock == nil {
    36  		return errors.NotValidf("nil Clock")
    37  	}
    38  	if config.Delay <= 0 {
    39  		return errors.NotValidf("non-positive Delay")
    40  	}
    41  	return nil
    42  }
    43  
    44  // NewRestartWorkers returns a worker that will live until Kill()ed,
    45  // giving access to a set of sub-workers needed by the state package.
    46  //
    47  // These workers may die of their own accord at any time, and will be
    48  // replaced after the configured delay; all active workers will be
    49  // stopped before Wait returns.
    50  func NewRestartWorkers(config RestartConfig) (*RestartWorkers, error) {
    51  	if err := config.Validate(); err != nil {
    52  		return nil, errors.Trace(err)
    53  	}
    54  
    55  	dw, err := NewDumbWorkers(DumbConfig{
    56  		Factory: config.Factory,
    57  		Logger:  config.Logger,
    58  	})
    59  	if err != nil {
    60  		return nil, errors.Trace(err)
    61  	}
    62  
    63  	rw := &RestartWorkers{
    64  		config:  config,
    65  		workers: dw,
    66  	}
    67  	err = catacomb.Invoke(catacomb.Plan{
    68  		Site: &rw.catacomb,
    69  		Work: rw.run,
    70  		Init: []worker.Worker{dw},
    71  	})
    72  	if err != nil {
    73  		return nil, errors.Trace(err)
    74  	}
    75  	return rw, nil
    76  }
    77  
// RestartWorkers wraps a DumbWorkers and restarts/replaces workers as
// they fail.
type RestartWorkers struct {
	config   RestartConfig
	catacomb catacomb.Catacomb

	// mu protects workers. The *DumbWorkers pointer itself is set once
	// at construction; only the worker fields inside it are replaced
	// (under mu) by the maintainer goroutines.
	mu      sync.Mutex
	workers *DumbWorkers

	// wg tracks maintainer goroutines, so run can wait for all
	// potential writers of workers to finish before stopping them.
	wg sync.WaitGroup
}
    91  
    92  // TxnLogWatcher is part of the Workers interface.
    93  func (rw *RestartWorkers) TxnLogWatcher() TxnLogWatcher {
    94  	rw.mu.Lock()
    95  	defer rw.mu.Unlock()
    96  	return rw.workers.txnLogWorker
    97  }
    98  
    99  // PresenceWatcher is part of the Workers interface.
   100  func (rw *RestartWorkers) PresenceWatcher() PresenceWatcher {
   101  	rw.mu.Lock()
   102  	defer rw.mu.Unlock()
   103  	return rw.workers.presenceWorker
   104  }
   105  
   106  // LeadershipManager is part of the Workers interface.
   107  func (rw *RestartWorkers) LeadershipManager() LeaseManager {
   108  	return DynamicLeaseManager{&rw.mu, &rw.workers.leadershipWorker}
   109  }
   110  
   111  // SingularManager is part of the Workers interface.
   112  func (rw *RestartWorkers) SingularManager() LeaseManager {
   113  	return DynamicLeaseManager{&rw.mu, &rw.workers.singularWorker}
   114  }
   115  
// Kill is part of the worker.Worker interface. Killing the catacomb
// causes run to stop the maintainer goroutines and the sub-workers.
func (rw *RestartWorkers) Kill() {
	rw.catacomb.Kill(nil)
}
   120  
// Wait is part of the worker.Worker interface. It blocks until run
// has returned, i.e. until all sub-workers have been stopped.
func (rw *RestartWorkers) Wait() error {
	return rw.catacomb.Wait()
}
   125  
// run starts one maintainer goroutine per sub-worker, waits for the
// catacomb to die, waits for every maintainer to finish, and only
// then stops the underlying DumbWorkers.
func (rw *RestartWorkers) run() error {

	// No maintainer goroutines exist yet, so it's safe to read the
	// worker fields here without holding the mutex.
	replacers := []replacer{
		&txnLogWorkerReplacer{
			start:   rw.config.Factory.NewTxnLogWorker,
			current: rw.workers.txnLogWorker,
			target:  &rw.workers.txnLogWorker,
		},
		&presenceWorkerReplacer{
			start:   rw.config.Factory.NewPresenceWorker,
			current: rw.workers.presenceWorker,
			target:  &rw.workers.presenceWorker,
		},
		&leaseWorkerReplacer{
			start:   rw.config.Factory.NewLeadershipWorker,
			current: rw.workers.leadershipWorker,
			target:  &rw.workers.leadershipWorker,
		},
		&leaseWorkerReplacer{
			start:   rw.config.Factory.NewSingularWorker,
			current: rw.workers.singularWorker,
			target:  &rw.workers.singularWorker,
		},
	}

	// begin critical section: cannot touch workers without mutex
	for _, replacer := range replacers {
		rw.wg.Add(1)
		go rw.maintain(replacer)
	}
	<-rw.catacomb.Dying()
	// wg reaches zero only once every maintainer has returned (each
	// maintainer Adds before spawning its successor -- see maintain),
	// so after Wait no further writes to rw.workers can occur.
	rw.wg.Wait()
	// end critical section: potential workers writes all finished

	return worker.Stop(rw.workers)
}
   162  
// maintain drives a replacer: it waits for the current worker to die,
// retries creating a replacement every config.Delay until one starts,
// installs it under the mutex, and then hands off to a fresh copy of
// itself to watch the new worker. See commentary in func, and docs on
// the replacer interface.
func (rw *RestartWorkers) maintain(replacer replacer) {

	// Signal to the RestartWorkers that we've stopped trying to
	// maintain a worker once we return from this func.
	defer rw.wg.Done()

	// First, wait until the worker actually needs replacement.
	select {
	case <-rw.catacomb.Dying():
		return
	case <-replacer.needed():
	}

	// Then try to create a replacement until we succeed...
	// (waiting out Delay before every attempt, and abandoning the
	// effort entirely once the catacomb starts dying).
	for {
		select {
		case <-rw.catacomb.Dying():
			return
		case <-rw.config.Clock.After(rw.config.Delay):
		}
		if replacer.prepare() {
			break
		}
	}

	// ...at which point it's OK to take the lock for long enough to
	// set the replacement worker.
	rw.mu.Lock()
	replacer.replace()
	rw.mu.Unlock()

	// Finally, signal to the RestartWorkers that we'll maintain the
	// new worker, effectively undoing the deferred Done above...
	// (Add runs before the deferred Done, so the WaitGroup count
	// never drops to zero while a successor is pending.)
	rw.wg.Add(1)

	// ...and start again from the top.
	go rw.maintain(replacer)
}
   203  
// replacer exists to satisfy the very narrow constraints of the
// RestartWorkers.maintain method. The methods will be called
// in the order defined, as annotated:
type replacer interface {

	// needed returns a channel that will be closed when the
	// original worker has failed and needs to be restarted;
	// once this has happened...
	needed() <-chan struct{}

	// ...prepare will then be called repeatedly until it returns
	// true, indicating that it's created a replacement worker; at
	// which point...
	prepare() bool

	// ...the workers mutex will be acquired, and it's safe for the
	// replacer to write the new worker to the target pointer (and
	// update its own internal references so that the next call to
	// needed() returns a channel tied to the new worker's
	// lifetime).
	replace()

	// The actual *implementation* of the various kinds of replacer
	// should not vary -- they'd be great candidates for codegen or
	// even generics(!).
}
   230  
   231  // txnLogWorkerReplacer implements replacer. Apart from the types, it
   232  // should be identical to presenceWorkerReplacer and leaseWorkerReplacer.
   233  type txnLogWorkerReplacer struct {
   234  	start   func() (TxnLogWorker, error)
   235  	current TxnLogWorker
   236  	next    TxnLogWorker
   237  	target  *TxnLogWorker
   238  }
   239  
   240  func (r *txnLogWorkerReplacer) needed() <-chan struct{} {
   241  	return worker.Dead(r.current)
   242  }
   243  
   244  func (r *txnLogWorkerReplacer) prepare() bool {
   245  	var err error
   246  	r.next, err = r.start()
   247  	return err == nil
   248  }
   249  
   250  func (r *txnLogWorkerReplacer) replace() {
   251  	*r.target = r.next
   252  	r.current = r.next
   253  	r.next = nil
   254  }
   255  
   256  // presenceWorkerReplacer implements replacer. Apart from the types, it
   257  // should be identical to txnLogWorkerReplacer and leaseWorkerReplacer.
   258  type presenceWorkerReplacer struct {
   259  	start   func() (PresenceWorker, error)
   260  	current PresenceWorker
   261  	next    PresenceWorker
   262  	target  *PresenceWorker
   263  }
   264  
   265  func (r *presenceWorkerReplacer) needed() <-chan struct{} {
   266  	return worker.Dead(r.current)
   267  }
   268  
   269  func (r *presenceWorkerReplacer) prepare() bool {
   270  	var err error
   271  	r.next, err = r.start()
   272  	return err == nil
   273  }
   274  
   275  func (r *presenceWorkerReplacer) replace() {
   276  	*r.target = r.next
   277  	r.current = r.next
   278  	r.next = nil
   279  }
   280  
   281  // leaseWorkerReplacer implements replacer. Apart from the types, it
   282  // should be identical to presenceWorkerReplacer and txnLogWorkerReplacer.
   283  type leaseWorkerReplacer struct {
   284  	start   func() (LeaseWorker, error)
   285  	current LeaseWorker
   286  	next    LeaseWorker
   287  	target  *LeaseWorker
   288  }
   289  
   290  func (r *leaseWorkerReplacer) needed() <-chan struct{} {
   291  	return worker.Dead(r.current)
   292  }
   293  
   294  func (r *leaseWorkerReplacer) prepare() bool {
   295  	var err error
   296  	r.next, err = r.start()
   297  	return err == nil
   298  }
   299  
   300  func (r *leaseWorkerReplacer) replace() {
   301  	*r.target = r.next
   302  	r.current = r.next
   303  	r.next = nil
   304  }
   305  
// DynamicLeaseManager is a workers.LeaseManager that re-reads the
// current underlying LeaseManager -- via a mutex-guarded pointer --
// on every method call. This enables us to hide the fact that workers
// returned from RestartWorkers may be replaced behind the scenes.
type DynamicLeaseManager struct {
	// mu guards *w; it is the RestartWorkers' workers mutex.
	mu *sync.Mutex
	// w points at the slot that maintainer goroutines overwrite when
	// they replace a failed lease worker.
	w  *LeaseWorker
}

// Claim is part of the lease.Claimer interface.
func (d DynamicLeaseManager) Claim(leaseName, holderName string, duration time.Duration) error {
	return d.Underlying().Claim(leaseName, holderName, duration)
}

// WaitUntilExpired is part of the lease.Claimer interface.
func (d DynamicLeaseManager) WaitUntilExpired(leaseName string) error {
	return d.Underlying().WaitUntilExpired(leaseName)
}

// Token is part of the lease.Checker interface.
func (d DynamicLeaseManager) Token(leaseName, holderName string) lease.Token {
	return d.Underlying().Token(leaseName, holderName)
}

// Underlying returns the current underlying LeaseManager, read under
// the mutex so a concurrent replacement cannot be observed half-done.
func (d DynamicLeaseManager) Underlying() LeaseManager {
	d.mu.Lock()
	defer d.mu.Unlock()
	return *d.w
}