github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/leadership/tracker.go (about)

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package leadership
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/clock"
    10  	"github.com/juju/errors"
    11  	"github.com/juju/loggo"
    12  	"github.com/juju/names/v5"
    13  	"gopkg.in/tomb.v2"
    14  
    15  	"github.com/juju/juju/core/leadership"
    16  )
    17  
// logger is the package-level logger used by the leadership worker; it is
// registered under the "juju.worker.leadership" name.
var logger = loggo.GetLogger("juju.worker.leadership")
    19  
// Tracker attempts to claim and retain application leadership on behalf of a
// single unit, and hands out tickets that report on that unit's leadership
// status. Every field other than tomb and the three *Tickets channels is
// read and written by the goroutine running loop (started in NewTracker)
// without locking.
//
// isMinion starts out true and is flipped by setLeader and setMinion.
// claimLease is non-nil while a goroutine started by setMinion is blocked
// waiting for leadership to be released; renewLease is non-nil while the
// Tracker is leader and a renewal is scheduled. waitingLeader and
// waitingMinion hold ticket channels parked until the next call to setLeader
// or setMinion respectively, or until the Tracker stops.
type Tracker struct {
	// Identity, collaborators and current leadership state.
	tomb            tomb.Tomb
	claimer         leadership.Claimer
	unitName        string
	applicationName string
	clock           clock.Clock
	duration        time.Duration
	isMinion        bool

	// Channels driving loop, and the tickets it has parked.
	claimLease        chan error
	renewLease        <-chan time.Time
	claimTickets      chan chan bool
	waitLeaderTickets chan chan bool
	waitMinionTickets chan chan bool
	waitingLeader     []chan bool
	waitingMinion     []chan bool
}
    37  
    38  // NewTracker returns a *Tracker that attempts to claim and retain application
    39  // leadership for the supplied unit. It will claim leadership for twice the
    40  // supplied duration, and once it's leader it will renew leadership every
    41  // time the duration elapses.
    42  // Thus, successful leadership claims on the resulting Tracker will guarantee
    43  // leadership for the duration supplied here without generating additional
    44  // calls to the supplied manager (which may very well be on the other side of
    45  // a network connection).
    46  func NewTracker(tag names.UnitTag, claimer leadership.Claimer, clock clock.Clock, duration time.Duration) *Tracker {
    47  	unitName := tag.Id()
    48  	applicationName, _ := names.UnitApplication(unitName)
    49  	t := &Tracker{
    50  		unitName:          unitName,
    51  		applicationName:   applicationName,
    52  		claimer:           claimer,
    53  		clock:             clock,
    54  		duration:          duration,
    55  		claimTickets:      make(chan chan bool),
    56  		waitLeaderTickets: make(chan chan bool),
    57  		waitMinionTickets: make(chan chan bool),
    58  		isMinion:          true,
    59  	}
    60  	t.tomb.Go(func() error {
    61  		defer func() {
    62  			for _, ticketCh := range t.waitingLeader {
    63  				close(ticketCh)
    64  			}
    65  			for _, ticketCh := range t.waitingMinion {
    66  				close(ticketCh)
    67  			}
    68  		}()
    69  		err := t.loop()
    70  		// TODO: jam 2015-04-02 is this the most elegant way to make
    71  		// sure we shutdown cleanly? Essentially the lowest level sees
    72  		// that we are dying, and propagates an ErrDying up to us so
    73  		// that we shut down, which we then are passing back into
    74  		// Tomb.Kill().
    75  		// Tomb.Kill() special cases the exact object ErrDying, and has
    76  		// no idea about errors.Cause and the general errors.Trace
    77  		// mechanisms that we use.
    78  		// So we explicitly unwrap before calling tomb.Kill() else
    79  		// tomb.Stop() thinks that we have a genuine error.
    80  		switch cause := errors.Cause(err); cause {
    81  		case tomb.ErrDying:
    82  			err = cause
    83  		}
    84  		return err
    85  	})
    86  	return t
    87  }
    88  
// Kill is part of the worker.Worker interface. It calls Kill on the
// Tracker's tomb with a nil reason; use Wait to collect the final error.
func (t *Tracker) Kill() {
	t.tomb.Kill(nil)
}
    93  
// Wait is part of the worker.Worker interface. It returns whatever the
// Tracker's tomb reports from its own Wait.
func (t *Tracker) Wait() error {
	return t.tomb.Wait()
}
    98  
// ApplicationName is part of the leadership.Tracker interface. It returns
// the application name that NewTracker derived from the unit tag.
func (t *Tracker) ApplicationName() string {
	return t.applicationName
}
   103  
// ClaimDuration is part of the leadership.Tracker interface. It returns the
// duration that was supplied to NewTracker (the guaranteed-leadership
// window, not the doubled lease length requested from the claimer).
func (t *Tracker) ClaimDuration() time.Duration {
	return t.duration
}
   108  
// ClaimLeader is part of the leadership.Tracker interface. It submits a
// ticket on claimTickets; the loop resolves it via resolveClaim, so the
// ticket succeeds only if leadership could be verified at that point.
func (t *Tracker) ClaimLeader() leadership.Ticket {
	return t.submit(t.claimTickets)
}
   113  
// WaitLeader is part of the leadership.Tracker interface. It submits a
// ticket on waitLeaderTickets; the loop resolves it via resolveWaitLeader,
// so the ticket stays pending until the unit is leader or the Tracker stops.
func (t *Tracker) WaitLeader() leadership.Ticket {
	return t.submit(t.waitLeaderTickets)
}
   118  
// WaitMinion is part of the leadership.Tracker interface. It submits a
// ticket on waitMinionTickets; the loop resolves it via resolveWaitMinion,
// so the ticket becomes ready once the unit is not leader or the Tracker
// stops. Note that the ticket's Wait result is true only when the ticket was
// parked while leader and leadership was subsequently lost; if the unit was
// already a minion the ticket is closed without a value.
func (t *Tracker) WaitMinion() leadership.Ticket {
	return t.submit(t.waitMinionTickets)
}
   123  
   124  func (t *Tracker) loop() error {
   125  	logger.Debugf("%s making initial claim for %s leadership", t.unitName, t.applicationName)
   126  	if err := t.refresh(); err != nil {
   127  		return errors.Trace(err)
   128  	}
   129  	for {
   130  		select {
   131  		case <-t.tomb.Dying():
   132  			return tomb.ErrDying
   133  		case err, ok := <-t.claimLease:
   134  			t.claimLease = nil
   135  			if errors.Cause(err) == leadership.ErrBlockCancelled || !ok {
   136  				// BlockUntilLeadershipReleased was cancelled,
   137  				// which means that the tracker is terminating.
   138  				continue
   139  			} else if err != nil {
   140  				return errors.Annotatef(err,
   141  					"error while %s waiting for %s leadership release",
   142  					t.unitName, t.applicationName,
   143  				)
   144  			}
   145  			logger.Tracef("%s claiming lease for %s leadership", t.unitName, t.applicationName)
   146  			if err := t.refresh(); err != nil {
   147  				return errors.Trace(err)
   148  			}
   149  		case <-t.renewLease:
   150  			logger.Tracef("%s renewing lease for %s leadership", t.unitName, t.applicationName)
   151  			t.renewLease = nil
   152  			if err := t.refresh(); err != nil {
   153  				return errors.Trace(err)
   154  			}
   155  		case ticketCh := <-t.claimTickets:
   156  			logger.Tracef("%s got claim request for %s leadership", t.unitName, t.applicationName)
   157  			if err := t.resolveClaim(ticketCh); err != nil {
   158  				return errors.Trace(err)
   159  			}
   160  		case ticketCh := <-t.waitLeaderTickets:
   161  			logger.Tracef("%s got wait request for %s leadership", t.unitName, t.applicationName)
   162  			if err := t.resolveWaitLeader(ticketCh); err != nil {
   163  				return errors.Trace(err)
   164  			}
   165  		case ticketCh := <-t.waitMinionTickets:
   166  			logger.Tracef("%s got wait request for %s leadership loss", t.unitName, t.applicationName)
   167  			if err := t.resolveWaitMinion(ticketCh); err != nil {
   168  				return errors.Trace(err)
   169  			}
   170  		}
   171  	}
   172  }
   173  
   174  // refresh makes a leadership request, and updates Tracker state to conform to
   175  // latest known reality.
   176  func (t *Tracker) refresh() error {
   177  	logger.Tracef("checking %s for %s leadership", t.unitName, t.applicationName)
   178  	leaseDuration := 2 * t.duration
   179  	untilTime := t.clock.Now().Add(leaseDuration)
   180  	err := t.claimer.ClaimLeadership(t.applicationName, t.unitName, leaseDuration)
   181  	switch {
   182  	case err == nil:
   183  		return t.setLeader(untilTime)
   184  	case errors.Cause(err) == leadership.ErrClaimDenied:
   185  		return t.setMinion()
   186  	}
   187  	return errors.Annotatef(err, "leadership failure")
   188  }
   189  
   190  // setLeader arranges for lease renewal.
   191  func (t *Tracker) setLeader(untilTime time.Time) error {
   192  	if t.isMinion {
   193  		// If we were a minion, we're now the leader, so we can record the transition.
   194  		logger.Infof("%s promoted to leadership of %s", t.unitName, t.applicationName)
   195  	}
   196  	logger.Tracef("%s confirmed for %s leadership until %s", t.unitName, t.applicationName, untilTime)
   197  	renewTime := untilTime.Add(-t.duration)
   198  	logger.Tracef("%s will renew %s leadership at %s", t.unitName, t.applicationName, renewTime)
   199  	t.isMinion = false
   200  	t.claimLease = nil
   201  	t.renewLease = t.clock.After(renewTime.Sub(t.clock.Now()))
   202  
   203  	for len(t.waitingLeader) > 0 {
   204  		logger.Tracef("notifying %s ticket of impending %s leadership", t.unitName, t.applicationName)
   205  		var ticketCh chan bool
   206  		ticketCh, t.waitingLeader = t.waitingLeader[0], t.waitingLeader[1:]
   207  		defer close(ticketCh)
   208  		if err := t.sendTrue(ticketCh); err != nil {
   209  			return errors.Trace(err)
   210  		}
   211  	}
   212  	return nil
   213  }
   214  
   215  // setMinion arranges for lease acquisition when there's an opportunity.
   216  func (t *Tracker) setMinion() error {
   217  	logger.Infof("%s leadership for %s denied", t.applicationName, t.unitName)
   218  	t.isMinion = true
   219  	t.renewLease = nil
   220  	if t.claimLease == nil {
   221  		t.claimLease = make(chan error, 1)
   222  		go func() {
   223  			defer close(t.claimLease)
   224  			logger.Debugf("%s waiting for %s leadership release", t.unitName, t.applicationName)
   225  			err := t.claimer.BlockUntilLeadershipReleased(t.applicationName, t.tomb.Dying())
   226  			if err != nil {
   227  				logger.Debugf("%s waiting for %s leadership release gave err: %s", t.unitName, t.applicationName, err)
   228  			}
   229  			select {
   230  			case t.claimLease <- err:
   231  			case <-t.tomb.Dying():
   232  			}
   233  		}()
   234  	}
   235  
   236  	for len(t.waitingMinion) > 0 {
   237  		logger.Debugf("notifying %s ticket of impending loss of %s leadership", t.unitName, t.applicationName)
   238  		var ticketCh chan bool
   239  		ticketCh, t.waitingMinion = t.waitingMinion[0], t.waitingMinion[1:]
   240  		defer close(ticketCh)
   241  		if err := t.sendTrue(ticketCh); err != nil {
   242  			return errors.Trace(err)
   243  		}
   244  	}
   245  	return nil
   246  }
   247  
   248  // isLeader returns true if leadership is guaranteed for the Tracker's duration.
   249  func (t *Tracker) isLeader() (bool, error) {
   250  	if !t.isMinion {
   251  		// Last time we looked, we were leader.
   252  		select {
   253  		case <-t.tomb.Dying():
   254  			return false, errors.Trace(tomb.ErrDying)
   255  		case <-t.renewLease:
   256  			logger.Tracef("%s renewing lease for %s leadership", t.unitName, t.applicationName)
   257  			t.renewLease = nil
   258  			if err := t.refresh(); err != nil {
   259  				return false, errors.Trace(err)
   260  			}
   261  		default:
   262  			logger.Tracef("%s still has %s leadership", t.unitName, t.applicationName)
   263  		}
   264  	}
   265  	return !t.isMinion, nil
   266  }
   267  
   268  // resolveClaim will send true on the supplied channel if leadership can be
   269  // successfully verified, and will always close it whether or not it sent.
   270  func (t *Tracker) resolveClaim(ticketCh chan bool) error {
   271  	logger.Tracef("resolving %s leadership ticket for %s...", t.applicationName, t.unitName)
   272  	defer close(ticketCh)
   273  	if leader, err := t.isLeader(); err != nil {
   274  		return errors.Trace(err)
   275  	} else if !leader {
   276  		logger.Debugf("%s is not %s leader", t.unitName, t.applicationName)
   277  		return nil
   278  	}
   279  	logger.Tracef("confirming %s leadership for %s", t.applicationName, t.unitName)
   280  	return t.sendTrue(ticketCh)
   281  }
   282  
   283  // resolveWaitLeader will send true on the supplied channel if leadership can be
   284  // guaranteed for the Tracker's duration. It will then close the channel. If
   285  // leadership cannot be guaranteed, the channel is left untouched until either
   286  // the termination of the Tracker or the next invocation of setLeader; at which
   287  // point true is sent if applicable, and the channel is closed.
   288  func (t *Tracker) resolveWaitLeader(ticketCh chan bool) error {
   289  	var dontClose bool
   290  	defer func() {
   291  		if !dontClose {
   292  			close(ticketCh)
   293  		}
   294  	}()
   295  
   296  	if leader, err := t.isLeader(); err != nil {
   297  		return errors.Trace(err)
   298  	} else if leader {
   299  		logger.Tracef("reporting %s leadership for %s", t.applicationName, t.unitName)
   300  		return t.sendTrue(ticketCh)
   301  	}
   302  
   303  	logger.Tracef("waiting for %s to attain %s leadership", t.unitName, t.applicationName)
   304  	t.waitingLeader = append(t.waitingLeader, ticketCh)
   305  	dontClose = true
   306  	return nil
   307  }
   308  
   309  // resolveWaitMinion will close the supplied channel as soon as leadership cannot
   310  // be guaranteed beyond the Tracker's duration.
   311  func (t *Tracker) resolveWaitMinion(ticketCh chan bool) error {
   312  	var dontClose bool
   313  	defer func() {
   314  		if !dontClose {
   315  			close(ticketCh)
   316  		}
   317  	}()
   318  
   319  	if leader, err := t.isLeader(); err != nil {
   320  		return errors.Trace(err)
   321  	} else if leader {
   322  		logger.Tracef("waiting for %s to lose %s leadership", t.unitName, t.applicationName)
   323  		t.waitingMinion = append(t.waitingMinion, ticketCh)
   324  		dontClose = true
   325  	} else {
   326  		logger.Tracef("reporting %s leadership loss for %s", t.applicationName, t.unitName)
   327  	}
   328  	return nil
   329  
   330  }
   331  
   332  func (t *Tracker) sendTrue(ticketCh chan bool) error {
   333  	select {
   334  	case <-t.tomb.Dying():
   335  		return tomb.ErrDying
   336  	case ticketCh <- true:
   337  		return nil
   338  	}
   339  }
   340  
   341  func (t *Tracker) submit(tickets chan chan bool) leadership.Ticket {
   342  	ticketCh := make(chan bool, 1)
   343  	select {
   344  	case <-t.tomb.Dying():
   345  		close(ticketCh)
   346  	case tickets <- ticketCh:
   347  	}
   348  	ticket := &ticket{
   349  		ch:    ticketCh,
   350  		ready: make(chan struct{}),
   351  	}
   352  	go ticket.run()
   353  	return ticket
   354  }
   355  
   356  // ticket is used by Tracker to communicate leadership status back to a client.
   357  type ticket struct {
   358  	ch      chan bool
   359  	ready   chan struct{}
   360  	success bool
   361  }
   362  
   363  func (t *ticket) run() {
   364  	defer close(t.ready)
   365  	// This is only safe/sane because the Tracker promises to close all pending
   366  	// ticket channels when it shuts down.
   367  	if <-t.ch {
   368  		t.success = true
   369  	}
   370  }
   371  
   372  // Ready is part of the leadership.Ticket interface.
   373  func (t *ticket) Ready() <-chan struct{} {
   374  	return t.ready
   375  }
   376  
   377  // Wait is part of the leadership.Ticket interface.
   378  func (t *ticket) Wait() bool {
   379  	<-t.ready
   380  	return t.success
   381  }