github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/leadership/tracker.go (about)

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package leadership
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/clock"
    10  	"github.com/juju/errors"
    11  	"github.com/juju/loggo"
    12  	"gopkg.in/juju/names.v2"
    13  	"gopkg.in/tomb.v2"
    14  
    15  	"github.com/juju/juju/core/leadership"
    16  )
    17  
    18  var logger = loggo.GetLogger("juju.worker.leadership")
    19  
    20  type Tracker struct {
    21  	tomb            tomb.Tomb
    22  	claimer         leadership.Claimer
    23  	unitName        string
    24  	applicationName string
    25  	clock           clock.Clock
    26  	duration        time.Duration
    27  	isMinion        bool
    28  
    29  	claimLease        chan error
    30  	renewLease        <-chan time.Time
    31  	claimTickets      chan chan bool
    32  	waitLeaderTickets chan chan bool
    33  	waitMinionTickets chan chan bool
    34  	waitingLeader     []chan bool
    35  	waitingMinion     []chan bool
    36  }
    37  
    38  // NewTracker returns a *Tracker that attempts to claim and retain service
    39  // leadership for the supplied unit. It will claim leadership for twice the
    40  // supplied duration, and once it's leader it will renew leadership every
    41  // time the duration elapses.
    42  // Thus, successful leadership claims on the resulting Tracker will guarantee
    43  // leadership for the duration supplied here without generating additional
    44  // calls to the supplied manager (which may very well be on the other side of
    45  // a network connection).
    46  func NewTracker(tag names.UnitTag, claimer leadership.Claimer, clock clock.Clock, duration time.Duration) *Tracker {
    47  	unitName := tag.Id()
    48  	serviceName, _ := names.UnitApplication(unitName)
    49  	t := &Tracker{
    50  		unitName:          unitName,
    51  		applicationName:   serviceName,
    52  		claimer:           claimer,
    53  		clock:             clock,
    54  		duration:          duration,
    55  		claimTickets:      make(chan chan bool),
    56  		waitLeaderTickets: make(chan chan bool),
    57  		waitMinionTickets: make(chan chan bool),
    58  		isMinion:          true,
    59  	}
    60  	t.tomb.Go(func() error {
    61  		defer func() {
    62  			for _, ticketCh := range t.waitingLeader {
    63  				close(ticketCh)
    64  			}
    65  			for _, ticketCh := range t.waitingMinion {
    66  				close(ticketCh)
    67  			}
    68  			if t.claimLease != nil {
    69  				// wait for the goroutine started
    70  				// by setLeader to exit.
    71  				<-t.claimLease
    72  			}
    73  		}()
    74  		err := t.loop()
    75  		// TODO: jam 2015-04-02 is this the most elegant way to make
    76  		// sure we shutdown cleanly? Essentially the lowest level sees
    77  		// that we are dying, and propagates an ErrDying up to us so
    78  		// that we shut down, which we then are passing back into
    79  		// Tomb.Kill().
    80  		// Tomb.Kill() special cases the exact object ErrDying, and has
    81  		// no idea about errors.Cause and the general errors.Trace
    82  		// mechanisms that we use.
    83  		// So we explicitly unwrap before calling tomb.Kill() else
    84  		// tomb.Stop() thinks that we have a genuine error.
    85  		switch cause := errors.Cause(err); cause {
    86  		case tomb.ErrDying:
    87  			err = cause
    88  		}
    89  		return err
    90  	})
    91  	return t
    92  }
    93  
    94  // Kill is part of the worker.Worker interface.
    95  func (t *Tracker) Kill() {
    96  	t.tomb.Kill(nil)
    97  }
    98  
    99  // Wait is part of the worker.Worker interface.
   100  func (t *Tracker) Wait() error {
   101  	return t.tomb.Wait()
   102  }
   103  
   104  // ApplicationName is part of the leadership.Tracker interface.
   105  func (t *Tracker) ApplicationName() string {
   106  	return t.applicationName
   107  }
   108  
   109  // ClaimDuration is part of the leadership.Tracker interface.
   110  func (t *Tracker) ClaimDuration() time.Duration {
   111  	return t.duration
   112  }
   113  
   114  // ClaimLeader is part of the leadership.Tracker interface.
   115  func (t *Tracker) ClaimLeader() leadership.Ticket {
   116  	return t.submit(t.claimTickets)
   117  }
   118  
   119  // WaitLeader is part of the leadership.Tracker interface.
   120  func (t *Tracker) WaitLeader() leadership.Ticket {
   121  	return t.submit(t.waitLeaderTickets)
   122  }
   123  
   124  // WaitMinion is part of the leadership.Tracker interface.
   125  func (t *Tracker) WaitMinion() leadership.Ticket {
   126  	return t.submit(t.waitMinionTickets)
   127  }
   128  
   129  func (t *Tracker) loop() error {
   130  	logger.Debugf("%s making initial claim for %s leadership", t.unitName, t.applicationName)
   131  	if err := t.refresh(); err != nil {
   132  		return errors.Trace(err)
   133  	}
   134  	for {
   135  		select {
   136  		case <-t.tomb.Dying():
   137  			return tomb.ErrDying
   138  		case err := <-t.claimLease:
   139  			t.claimLease = nil
   140  			if errors.Cause(err) == leadership.ErrBlockCancelled {
   141  				// BlockUntilLeadershipReleased was cancelled,
   142  				// which means that the tracker is terminating.
   143  				continue
   144  			} else if err != nil {
   145  				return errors.Annotatef(err,
   146  					"error while %s waiting for %s leadership release",
   147  					t.unitName, t.applicationName,
   148  				)
   149  			}
   150  			logger.Tracef("%s claiming lease for %s leadership", t.unitName, t.applicationName)
   151  			if err := t.refresh(); err != nil {
   152  				return errors.Trace(err)
   153  			}
   154  		case <-t.renewLease:
   155  			logger.Tracef("%s renewing lease for %s leadership", t.unitName, t.applicationName)
   156  			t.renewLease = nil
   157  			if err := t.refresh(); err != nil {
   158  				return errors.Trace(err)
   159  			}
   160  		case ticketCh := <-t.claimTickets:
   161  			logger.Tracef("%s got claim request for %s leadership", t.unitName, t.applicationName)
   162  			if err := t.resolveClaim(ticketCh); err != nil {
   163  				return errors.Trace(err)
   164  			}
   165  		case ticketCh := <-t.waitLeaderTickets:
   166  			logger.Tracef("%s got wait request for %s leadership", t.unitName, t.applicationName)
   167  			if err := t.resolveWaitLeader(ticketCh); err != nil {
   168  				return errors.Trace(err)
   169  			}
   170  		case ticketCh := <-t.waitMinionTickets:
   171  			logger.Tracef("%s got wait request for %s leadership loss", t.unitName, t.applicationName)
   172  			if err := t.resolveWaitMinion(ticketCh); err != nil {
   173  				return errors.Trace(err)
   174  			}
   175  		}
   176  	}
   177  }
   178  
   179  // refresh makes a leadership request, and updates Tracker state to conform to
   180  // latest known reality.
   181  func (t *Tracker) refresh() error {
   182  	logger.Tracef("checking %s for %s leadership", t.unitName, t.applicationName)
   183  	leaseDuration := 2 * t.duration
   184  	untilTime := t.clock.Now().Add(leaseDuration)
   185  	err := t.claimer.ClaimLeadership(t.applicationName, t.unitName, leaseDuration)
   186  	switch {
   187  	case err == nil:
   188  		return t.setLeader(untilTime)
   189  	case errors.Cause(err) == leadership.ErrClaimDenied:
   190  		return t.setMinion()
   191  	}
   192  	return errors.Annotatef(err, "leadership failure")
   193  }
   194  
   195  // setLeader arranges for lease renewal.
   196  func (t *Tracker) setLeader(untilTime time.Time) error {
   197  	if t.isMinion {
   198  		// If we were a minion, we're now the leader, so we can record the transition.
   199  		logger.Infof("%s promoted to leadership of %s", t.unitName, t.applicationName)
   200  	}
   201  	logger.Tracef("%s confirmed for %s leadership until %s", t.unitName, t.applicationName, untilTime)
   202  	renewTime := untilTime.Add(-t.duration)
   203  	logger.Tracef("%s will renew %s leadership at %s", t.unitName, t.applicationName, renewTime)
   204  	t.isMinion = false
   205  	t.claimLease = nil
   206  	t.renewLease = t.clock.After(renewTime.Sub(t.clock.Now()))
   207  
   208  	for len(t.waitingLeader) > 0 {
   209  		logger.Tracef("notifying %s ticket of impending %s leadership", t.unitName, t.applicationName)
   210  		var ticketCh chan bool
   211  		ticketCh, t.waitingLeader = t.waitingLeader[0], t.waitingLeader[1:]
   212  		defer close(ticketCh)
   213  		if err := t.sendTrue(ticketCh); err != nil {
   214  			return errors.Trace(err)
   215  		}
   216  	}
   217  	return nil
   218  }
   219  
   220  // setMinion arranges for lease acquisition when there's an opportunity.
   221  func (t *Tracker) setMinion() error {
   222  	logger.Infof("%s leadership for %s denied", t.applicationName, t.unitName)
   223  	t.isMinion = true
   224  	t.renewLease = nil
   225  	if t.claimLease == nil {
   226  		t.claimLease = make(chan error, 1)
   227  		go func() {
   228  			defer close(t.claimLease)
   229  			logger.Debugf("%s waiting for %s leadership release", t.unitName, t.applicationName)
   230  			err := t.claimer.BlockUntilLeadershipReleased(t.applicationName, t.tomb.Dying())
   231  			t.claimLease <- err
   232  		}()
   233  	}
   234  
   235  	for len(t.waitingMinion) > 0 {
   236  		logger.Debugf("notifying %s ticket of impending loss of %s leadership", t.unitName, t.applicationName)
   237  		var ticketCh chan bool
   238  		ticketCh, t.waitingMinion = t.waitingMinion[0], t.waitingMinion[1:]
   239  		defer close(ticketCh)
   240  		if err := t.sendTrue(ticketCh); err != nil {
   241  			return errors.Trace(err)
   242  		}
   243  	}
   244  	return nil
   245  }
   246  
   247  // isLeader returns true if leadership is guaranteed for the Tracker's duration.
   248  func (t *Tracker) isLeader() (bool, error) {
   249  	if !t.isMinion {
   250  		// Last time we looked, we were leader.
   251  		select {
   252  		case <-t.tomb.Dying():
   253  			return false, errors.Trace(tomb.ErrDying)
   254  		case <-t.renewLease:
   255  			logger.Tracef("%s renewing lease for %s leadership", t.unitName, t.applicationName)
   256  			t.renewLease = nil
   257  			if err := t.refresh(); err != nil {
   258  				return false, errors.Trace(err)
   259  			}
   260  		default:
   261  			logger.Tracef("%s still has %s leadership", t.unitName, t.applicationName)
   262  		}
   263  	}
   264  	return !t.isMinion, nil
   265  }
   266  
   267  // resolveClaim will send true on the supplied channel if leadership can be
   268  // successfully verified, and will always close it whether or not it sent.
   269  func (t *Tracker) resolveClaim(ticketCh chan bool) error {
   270  	logger.Tracef("resolving %s leadership ticket for %s...", t.applicationName, t.unitName)
   271  	defer close(ticketCh)
   272  	if leader, err := t.isLeader(); err != nil {
   273  		return errors.Trace(err)
   274  	} else if !leader {
   275  		logger.Debugf("%s is not %s leader", t.unitName, t.applicationName)
   276  		return nil
   277  	}
   278  	logger.Tracef("confirming %s leadership for %s", t.applicationName, t.unitName)
   279  	return t.sendTrue(ticketCh)
   280  }
   281  
   282  // resolveWaitLeader will send true on the supplied channel if leadership can be
   283  // guaranteed for the Tracker's duration. It will then close the channel. If
   284  // leadership cannot be guaranteed, the channel is left untouched until either
   285  // the termination of the Tracker or the next invocation of setLeader; at which
   286  // point true is sent if applicable, and the channel is closed.
   287  func (t *Tracker) resolveWaitLeader(ticketCh chan bool) error {
   288  	var dontClose bool
   289  	defer func() {
   290  		if !dontClose {
   291  			close(ticketCh)
   292  		}
   293  	}()
   294  
   295  	if leader, err := t.isLeader(); err != nil {
   296  		return errors.Trace(err)
   297  	} else if leader {
   298  		logger.Tracef("reporting %s leadership for %s", t.applicationName, t.unitName)
   299  		return t.sendTrue(ticketCh)
   300  	}
   301  
   302  	logger.Tracef("waiting for %s to attain %s leadership", t.unitName, t.applicationName)
   303  	t.waitingLeader = append(t.waitingLeader, ticketCh)
   304  	dontClose = true
   305  	return nil
   306  }
   307  
   308  // resolveWaitMinion will close the supplied channel as soon as leadership cannot
   309  // be guaranteed beyond the Tracker's duration.
   310  func (t *Tracker) resolveWaitMinion(ticketCh chan bool) error {
   311  	var dontClose bool
   312  	defer func() {
   313  		if !dontClose {
   314  			close(ticketCh)
   315  		}
   316  	}()
   317  
   318  	if leader, err := t.isLeader(); err != nil {
   319  		return errors.Trace(err)
   320  	} else if leader {
   321  		logger.Tracef("waiting for %s to lose %s leadership", t.unitName, t.applicationName)
   322  		t.waitingMinion = append(t.waitingMinion, ticketCh)
   323  		dontClose = true
   324  	} else {
   325  		logger.Tracef("reporting %s leadership loss for %s", t.applicationName, t.unitName)
   326  	}
   327  	return nil
   328  
   329  }
   330  
   331  func (t *Tracker) sendTrue(ticketCh chan bool) error {
   332  	select {
   333  	case <-t.tomb.Dying():
   334  		return tomb.ErrDying
   335  	case ticketCh <- true:
   336  		return nil
   337  	}
   338  }
   339  
   340  func (t *Tracker) submit(tickets chan chan bool) leadership.Ticket {
   341  	ticketCh := make(chan bool, 1)
   342  	select {
   343  	case <-t.tomb.Dying():
   344  		close(ticketCh)
   345  	case tickets <- ticketCh:
   346  	}
   347  	ticket := &ticket{
   348  		ch:    ticketCh,
   349  		ready: make(chan struct{}),
   350  	}
   351  	go ticket.run()
   352  	return ticket
   353  }
   354  
   355  // ticket is used by Tracker to communicate leadership status back to a client.
   356  type ticket struct {
   357  	ch      chan bool
   358  	ready   chan struct{}
   359  	success bool
   360  }
   361  
   362  func (t *ticket) run() {
   363  	defer close(t.ready)
   364  	// This is only safe/sane because the Tracker promises to close all pending
   365  	// ticket channels when it shuts down.
   366  	if <-t.ch {
   367  		t.success = true
   368  	}
   369  }
   370  
   371  // Ready is part of the leadership.Ticket interface.
   372  func (t *ticket) Ready() <-chan struct{} {
   373  	return t.ready
   374  }
   375  
   376  // Wait is part of the leadership.Ticket interface.
   377  func (t *ticket) Wait() bool {
   378  	<-t.ready
   379  	return t.success
   380  }