github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/worker/leadership/tracker.go (about)

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package leadership
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/errors"
    10  	"github.com/juju/loggo"
    11  	"github.com/juju/utils/clock"
    12  	"gopkg.in/juju/names.v2"
    13  	"gopkg.in/tomb.v1"
    14  
    15  	"github.com/juju/juju/core/leadership"
    16  )
    17  
// logger is the package-level logger, registered under the name
// "juju.worker.leadership".
var logger = loggo.GetLogger("juju.worker.leadership")
    19  
// Tracker attempts to claim and retain leadership of an application on behalf
// of a single unit, and resolves leadership tickets submitted through
// ClaimLeader, WaitLeader and WaitMinion. Use NewTracker to create one; Kill
// and Wait control and observe its lifetime.
type Tracker struct {
	// tomb tracks the lifecycle of the goroutine started by NewTracker.
	tomb            tomb.Tomb
	// claimer is used to claim leadership and to block until leadership is
	// released.
	claimer         leadership.Claimer
	// unitName is the unit on whose behalf claims are made.
	unitName        string
	// applicationName is the application whose leadership is claimed.
	applicationName string
	// clock supplies the current time and the lease-renewal timer.
	clock           clock.Clock
	// duration is the period returned by ClaimDuration; leases are requested
	// for twice this long (see refresh).
	duration        time.Duration
	// isMinion records whether the most recent claim was denied.
	isMinion        bool

	// claimLease, when non-nil, is closed once BlockUntilLeadershipReleased
	// returns, which prompts the loop to attempt a fresh claim.
	claimLease        chan struct{}
	// renewLease, when non-nil, fires when a held lease is due for renewal.
	renewLease        <-chan time.Time
	// claimTickets, waitLeaderTickets and waitMinionTickets carry newly
	// submitted ticket channels from submit to the loop.
	claimTickets      chan chan bool
	waitLeaderTickets chan chan bool
	waitMinionTickets chan chan bool
	// waitingLeader and waitingMinion hold ticket channels that could not be
	// resolved immediately; they are drained by setLeader and setMinion
	// respectively, and any left over are closed when the loop goroutine exits.
	waitingLeader     []chan bool
	waitingMinion     []chan bool
}
    37  
    38  // NewTracker returns a *Tracker that attempts to claim and retain service
    39  // leadership for the supplied unit. It will claim leadership for twice the
    40  // supplied duration, and once it's leader it will renew leadership every
    41  // time the duration elapses.
    42  // Thus, successful leadership claims on the resulting Tracker will guarantee
    43  // leadership for the duration supplied here without generating additional
    44  // calls to the supplied manager (which may very well be on the other side of
    45  // a network connection).
    46  func NewTracker(tag names.UnitTag, claimer leadership.Claimer, clock clock.Clock, duration time.Duration) *Tracker {
    47  	unitName := tag.Id()
    48  	serviceName, _ := names.UnitApplication(unitName)
    49  	t := &Tracker{
    50  		unitName:          unitName,
    51  		applicationName:   serviceName,
    52  		claimer:           claimer,
    53  		clock:             clock,
    54  		duration:          duration,
    55  		claimTickets:      make(chan chan bool),
    56  		waitLeaderTickets: make(chan chan bool),
    57  		waitMinionTickets: make(chan chan bool),
    58  	}
    59  	go func() {
    60  		defer t.tomb.Done()
    61  		defer func() {
    62  			for _, ticketCh := range t.waitingLeader {
    63  				close(ticketCh)
    64  			}
    65  			for _, ticketCh := range t.waitingMinion {
    66  				close(ticketCh)
    67  			}
    68  		}()
    69  		err := t.loop()
    70  		// TODO: jam 2015-04-02 is this the most elegant way to make
    71  		// sure we shutdown cleanly? Essentially the lowest level sees
    72  		// that we are dying, and propagates an ErrDying up to us so
    73  		// that we shut down, which we then are passing back into
    74  		// Tomb.Kill().
    75  		// Tomb.Kill() special cases the exact object ErrDying, and has
    76  		// no idea about errors.Cause and the general errors.Trace
    77  		// mechanisms that we use.
    78  		// So we explicitly unwrap before calling tomb.Kill() else
    79  		// tomb.Stop() thinks that we have a genuine error.
    80  		switch cause := errors.Cause(err); cause {
    81  		case tomb.ErrDying:
    82  			err = cause
    83  		}
    84  		t.tomb.Kill(err)
    85  	}()
    86  	return t
    87  }
    88  
// Kill is part of the worker.Worker interface. It asks the Tracker to stop by
// killing its tomb with a nil error; it does not wait for the loop goroutine
// to finish (use Wait for that).
func (t *Tracker) Kill() {
	t.tomb.Kill(nil)
}
    93  
// Wait is part of the worker.Worker interface. It returns whatever the
// Tracker's tomb reports from its own Wait, i.e. the reason the loop
// goroutine stopped.
func (t *Tracker) Wait() error {
	return t.tomb.Wait()
}
    98  
// ApplicationName is part of the leadership.Tracker interface. It returns the
// name of the application whose leadership this Tracker claims, as derived
// from the unit tag passed to NewTracker.
func (t *Tracker) ApplicationName() string {
	return t.applicationName
}
   103  
// ClaimDuration is part of the leadership.Tracker interface. It returns the
// duration supplied to NewTracker, which is half the length of the lease
// requested on each claim (see refresh).
func (t *Tracker) ClaimDuration() time.Duration {
	return t.duration
}
   108  
// ClaimLeader is part of the leadership.Tracker interface. It submits a
// ticket on claimTickets; the loop resolves it via resolveClaim, so the
// ticket succeeds only if leadership can be verified at that point.
func (t *Tracker) ClaimLeader() leadership.Ticket {
	return t.submit(t.claimTickets)
}
   113  
// WaitLeader is part of the leadership.Tracker interface. It submits a ticket
// on waitLeaderTickets; the loop resolves it via resolveWaitLeader, so the
// ticket stays pending until leadership is held or the Tracker shuts down.
func (t *Tracker) WaitLeader() leadership.Ticket {
	return t.submit(t.waitLeaderTickets)
}
   118  
// WaitMinion is part of the leadership.Tracker interface. It submits a ticket
// on waitMinionTickets; the loop resolves it via resolveWaitMinion, so the
// ticket stays pending while leadership is held and becomes ready once
// leadership is found to be lost or the Tracker shuts down.
func (t *Tracker) WaitMinion() leadership.Ticket {
	return t.submit(t.waitMinionTickets)
}
   123  
   124  func (t *Tracker) loop() error {
   125  	logger.Debugf("%s making initial claim for %s leadership", t.unitName, t.applicationName)
   126  	if err := t.refresh(); err != nil {
   127  		return errors.Trace(err)
   128  	}
   129  	for {
   130  		select {
   131  		case <-t.tomb.Dying():
   132  			return tomb.ErrDying
   133  		case <-t.claimLease:
   134  			logger.Debugf("%s claiming lease for %s leadership", t.unitName, t.applicationName)
   135  			t.claimLease = nil
   136  			if err := t.refresh(); err != nil {
   137  				return errors.Trace(err)
   138  			}
   139  		case <-t.renewLease:
   140  			logger.Debugf("%s renewing lease for %s leadership", t.unitName, t.applicationName)
   141  			t.renewLease = nil
   142  			if err := t.refresh(); err != nil {
   143  				return errors.Trace(err)
   144  			}
   145  		case ticketCh := <-t.claimTickets:
   146  			logger.Debugf("%s got claim request for %s leadership", t.unitName, t.applicationName)
   147  			if err := t.resolveClaim(ticketCh); err != nil {
   148  				return errors.Trace(err)
   149  			}
   150  		case ticketCh := <-t.waitLeaderTickets:
   151  			logger.Debugf("%s got wait request for %s leadership", t.unitName, t.applicationName)
   152  			if err := t.resolveWaitLeader(ticketCh); err != nil {
   153  				return errors.Trace(err)
   154  			}
   155  		case ticketCh := <-t.waitMinionTickets:
   156  			logger.Debugf("%s got wait request for %s leadership loss", t.unitName, t.applicationName)
   157  			if err := t.resolveWaitMinion(ticketCh); err != nil {
   158  				return errors.Trace(err)
   159  			}
   160  		}
   161  	}
   162  }
   163  
   164  // refresh makes a leadership request, and updates Tracker state to conform to
   165  // latest known reality.
   166  func (t *Tracker) refresh() error {
   167  	logger.Debugf("checking %s for %s leadership", t.unitName, t.applicationName)
   168  	leaseDuration := 2 * t.duration
   169  	untilTime := t.clock.Now().Add(leaseDuration)
   170  	err := t.claimer.ClaimLeadership(t.applicationName, t.unitName, leaseDuration)
   171  	switch {
   172  	case err == nil:
   173  		return t.setLeader(untilTime)
   174  	case errors.Cause(err) == leadership.ErrClaimDenied:
   175  		return t.setMinion()
   176  	}
   177  	return errors.Annotatef(err, "leadership failure")
   178  }
   179  
   180  // setLeader arranges for lease renewal.
   181  func (t *Tracker) setLeader(untilTime time.Time) error {
   182  	logger.Debugf("%s confirmed for %s leadership until %s", t.unitName, t.applicationName, untilTime)
   183  	renewTime := untilTime.Add(-t.duration)
   184  	logger.Infof("%s will renew %s leadership at %s", t.unitName, t.applicationName, renewTime)
   185  	t.isMinion = false
   186  	t.claimLease = nil
   187  	t.renewLease = t.clock.After(renewTime.Sub(t.clock.Now()))
   188  
   189  	for len(t.waitingLeader) > 0 {
   190  		logger.Debugf("notifying %s ticket of impending %s leadership", t.unitName, t.applicationName)
   191  		var ticketCh chan bool
   192  		ticketCh, t.waitingLeader = t.waitingLeader[0], t.waitingLeader[1:]
   193  		defer close(ticketCh)
   194  		if err := t.sendTrue(ticketCh); err != nil {
   195  			return errors.Trace(err)
   196  		}
   197  	}
   198  	return nil
   199  }
   200  
   201  // setMinion arranges for lease acquisition when there's an opportunity.
   202  func (t *Tracker) setMinion() error {
   203  	logger.Infof("%s leadership for %s denied", t.applicationName, t.unitName)
   204  	t.isMinion = true
   205  	t.renewLease = nil
   206  	if t.claimLease == nil {
   207  		t.claimLease = make(chan struct{})
   208  		go func() {
   209  			defer close(t.claimLease)
   210  			logger.Debugf("%s waiting for %s leadership release", t.unitName, t.applicationName)
   211  			err := t.claimer.BlockUntilLeadershipReleased(t.applicationName)
   212  			if err != nil {
   213  				logger.Debugf("error while %s waiting for %s leadership release: %v", t.unitName, t.applicationName, err)
   214  			}
   215  			// We don't need to do anything else with the error, because we just
   216  			// close the claimLease channel and trigger a leadership claim on the
   217  			// main loop; if anything's gone seriously wrong we'll find out right
   218  			// away and shut down anyway. (And if this goroutine outlives the
   219  			// Tracker, it keeps it around as a zombie, but I don't see a way
   220  			// around that...)
   221  		}()
   222  	}
   223  
   224  	for len(t.waitingMinion) > 0 {
   225  		logger.Debugf("notifying %s ticket of impending loss of %s leadership", t.unitName, t.applicationName)
   226  		var ticketCh chan bool
   227  		ticketCh, t.waitingMinion = t.waitingMinion[0], t.waitingMinion[1:]
   228  		defer close(ticketCh)
   229  		if err := t.sendTrue(ticketCh); err != nil {
   230  			return errors.Trace(err)
   231  		}
   232  	}
   233  	return nil
   234  }
   235  
   236  // isLeader returns true if leadership is guaranteed for the Tracker's duration.
   237  func (t *Tracker) isLeader() (bool, error) {
   238  	if !t.isMinion {
   239  		// Last time we looked, we were leader.
   240  		select {
   241  		case <-t.tomb.Dying():
   242  			return false, errors.Trace(tomb.ErrDying)
   243  		case <-t.renewLease:
   244  			logger.Debugf("%s renewing lease for %s leadership", t.unitName, t.applicationName)
   245  			t.renewLease = nil
   246  			if err := t.refresh(); err != nil {
   247  				return false, errors.Trace(err)
   248  			}
   249  		default:
   250  			logger.Debugf("%s still has %s leadership", t.unitName, t.applicationName)
   251  		}
   252  	}
   253  	return !t.isMinion, nil
   254  }
   255  
   256  // resolveClaim will send true on the supplied channel if leadership can be
   257  // successfully verified, and will always close it whether or not it sent.
   258  func (t *Tracker) resolveClaim(ticketCh chan bool) error {
   259  	logger.Debugf("resolving %s leadership ticket for %s...", t.applicationName, t.unitName)
   260  	defer close(ticketCh)
   261  	if leader, err := t.isLeader(); err != nil {
   262  		return errors.Trace(err)
   263  	} else if !leader {
   264  		logger.Debugf("%s is not %s leader", t.unitName, t.applicationName)
   265  		return nil
   266  	}
   267  	logger.Debugf("confirming %s leadership for %s", t.applicationName, t.unitName)
   268  	return t.sendTrue(ticketCh)
   269  }
   270  
   271  // resolveWaitLeader will send true on the supplied channel if leadership can be
   272  // guaranteed for the Tracker's duration. It will then close the channel. If
   273  // leadership cannot be guaranteed, the channel is left untouched until either
   274  // the termination of the Tracker or the next invocation of setLeader; at which
   275  // point true is sent if applicable, and the channel is closed.
   276  func (t *Tracker) resolveWaitLeader(ticketCh chan bool) error {
   277  	var dontClose bool
   278  	defer func() {
   279  		if !dontClose {
   280  			close(ticketCh)
   281  		}
   282  	}()
   283  
   284  	if leader, err := t.isLeader(); err != nil {
   285  		return errors.Trace(err)
   286  	} else if leader {
   287  		logger.Debugf("reporting %s leadership for %s", t.applicationName, t.unitName)
   288  		return t.sendTrue(ticketCh)
   289  	}
   290  
   291  	logger.Debugf("waiting for %s to attain %s leadership", t.unitName, t.applicationName)
   292  	t.waitingLeader = append(t.waitingLeader, ticketCh)
   293  	dontClose = true
   294  	return nil
   295  }
   296  
   297  // resolveWaitMinion will close the supplied channel as soon as leadership cannot
   298  // be guaranteed beyond the Tracker's duration.
   299  func (t *Tracker) resolveWaitMinion(ticketCh chan bool) error {
   300  	var dontClose bool
   301  	defer func() {
   302  		if !dontClose {
   303  			close(ticketCh)
   304  		}
   305  	}()
   306  
   307  	if leader, err := t.isLeader(); err != nil {
   308  		return errors.Trace(err)
   309  	} else if leader {
   310  		logger.Debugf("waiting for %s to lose %s leadership", t.unitName, t.applicationName)
   311  		t.waitingMinion = append(t.waitingMinion, ticketCh)
   312  		dontClose = true
   313  	} else {
   314  		logger.Debugf("reporting %s leadership loss for %s", t.applicationName, t.unitName)
   315  	}
   316  	return nil
   317  
   318  }
   319  
// sendTrue sends true on ticketCh, or returns tomb.ErrDying if the tomb is
// dying. It never closes ticketCh; that is left to the caller. When both
// cases are ready the select picks one of them, so a dying Tracker may or may
// not deliver the value.
func (t *Tracker) sendTrue(ticketCh chan bool) error {
	select {
	case <-t.tomb.Dying():
		return tomb.ErrDying
	case ticketCh <- true:
		return nil
	}
}
   328  
   329  func (t *Tracker) submit(tickets chan chan bool) leadership.Ticket {
   330  	ticketCh := make(chan bool, 1)
   331  	select {
   332  	case <-t.tomb.Dying():
   333  		close(ticketCh)
   334  	case tickets <- ticketCh:
   335  	}
   336  	ticket := &ticket{
   337  		ch:    ticketCh,
   338  		ready: make(chan struct{}),
   339  	}
   340  	go ticket.run()
   341  	return ticket
   342  }
   343  
   344  // ticket is used by Tracker to communicate leadership status back to a client.
   345  type ticket struct {
   346  	ch      chan bool
   347  	ready   chan struct{}
   348  	success bool
   349  }
   350  
   351  func (t *ticket) run() {
   352  	defer close(t.ready)
   353  	// This is only safe/sane because the Tracker promises to close all pending
   354  	// ticket channels when it shuts down.
   355  	if <-t.ch {
   356  		t.success = true
   357  	}
   358  }
   359  
   360  // Ready is part of the leadership.Ticket interface.
   361  func (t *ticket) Ready() <-chan struct{} {
   362  	return t.ready
   363  }
   364  
   365  // Wait is part of the leadership.Ticket interface.
   366  func (t *ticket) Wait() bool {
   367  	<-t.ready
   368  	return t.success
   369  }