github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/lease/manager.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package lease

import (
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"time"

	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/prometheus/client_golang/prometheus"
	"gopkg.in/retry.v1"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/core/lease"
	"github.com/juju/juju/database/txn"
)
    26  
const (
	// maxRetries gives the maximum number of attempts we'll try if
	// there are timeouts.
	maxRetries = 10

	// initialRetryDelay is the starting delay - this will be
	// increased exponentially up to maxRetries.
	initialRetryDelay = 50 * time.Millisecond

	// retryBackoffFactor is the multiplier applied to the delay after
	// each failed attempt. Retrying 10 times starting at 50ms and backing
	// off 1.6x gives us a total delay time of about 9s.
	retryBackoffFactor = 1.6

	// maxShutdownWait is the maximum time to wait for the async
	// claims and expires to complete before stopping the worker
	// anyway. Picked to be slightly quicker than the httpserver
	// shutdown timeout.
	maxShutdownWait = 55 * time.Second
)
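
// As a rough check on the arithmetic above (jitter aside): ten delays spaced
// 1.6x apart starting at 50ms sum to the geometric series
//
//	50ms * (1.6^10 - 1) / (1.6 - 1) ≈ 50ms * 181.6 ≈ 9.1s
//
// which is where the "about 9s" figure comes from.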

// errStopped is returned to clients when an operation cannot complete because
// the manager has started (and possibly finished) shutdown.
var errStopped = errors.New("lease manager stopped")

// NewManager returns a new *Manager configured as supplied. The caller takes
// responsibility for killing, and handling errors from, the returned Worker.
func NewManager(config ManagerConfig) (*Manager, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Trace(err)
	}
	logContext := config.EntityUUID
	if len(logContext) > 6 {
		logContext = logContext[:6]
	}
	manager := &Manager{
		config:     config,
		claims:     make(chan claim),
		revokes:    make(chan revoke),
		checks:     make(chan check),
		blocks:     make(chan block),
		expireDone: make(chan struct{}),
		pins:       make(chan pin),
		unpins:     make(chan pin),
		logContext: logContext,
	}
	manager.tomb.Go(manager.loop)
	return manager, nil
}
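
// A minimal usage sketch, not taken from this repository: the values below
// are placeholders, and only the fields referenced elsewhere in this file
// (Store, Clock, Logger, MaxSleep) are shown; Validate may require others.
//
//	manager, err := NewManager(ManagerConfig{
//		Store:    store,           // a lease store implementation (assumed available)
//		Clock:    clock.WallClock, // from github.com/juju/clock
//		Logger:   logger,          // a logger satisfying the config's interface
//		MaxSleep: time.Minute,
//	})
//	if err != nil {
//		return errors.Trace(err)
//	}
//	defer func() {
//		manager.Kill()
//		_ = manager.Wait() // handle the worker's exit error as appropriate
//	}()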

// Manager implements worker.Worker and can be bound to get
// lease.Checkers and lease.Claimers.
type Manager struct {
	tomb tomb.Tomb

	// config collects all external configuration and dependencies.
	config ManagerConfig

	// logContext is just a string that associates messages in the log.
	// It is seeded with the first six characters of config.EntityUUID,
	// if supplied.
	logContext string

	// nextTimeout is the next time at which a possible expiry we care
	// about is due, capped at the maximum sleep time.
	nextTimeout time.Time

	// timer tracks when nextTimeout would expire and triggers when it does.
	timer clock.Timer

	// claims is used to deliver lease claim requests to the loop.
	claims chan claim

	// revokes is used to deliver lease revoke requests to the loop.
	revokes chan revoke

	// checks is used to deliver lease check requests to the loop.
	checks chan check

	// expireDone is sent an event when we successfully finish a call to expire().
	expireDone chan struct{}

	// blocks is used to deliver expiry block requests to the loop.
	blocks chan block

	// pins is used to deliver lease pin requests to the loop.
	pins chan pin

	// unpins is used to deliver lease unpin requests to the loop.
	unpins chan pin

	// wg is used to ensure that all child goroutines are finished
	// before we stop.
	wg sync.WaitGroup

	// outstandingClaims tracks how many unfinished claim goroutines
	// are running (for debugging purposes).
	outstandingClaims int64

	// outstandingRevokes tracks how many unfinished revoke goroutines
	// are running (for debugging purposes).
	outstandingRevokes int64
}

// Kill is part of the worker.Worker interface.
func (manager *Manager) Kill() {
	manager.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (manager *Manager) Wait() error {
	return manager.tomb.Wait()
}

// loop runs until the manager is stopped.
func (manager *Manager) loop() error {
	if collector, ok := manager.config.Store.(prometheus.Collector); ok && manager.config.PrometheusRegisterer != nil {
		// The store implements the collector interface, but the lease.Store
		// interface does not expose it.
		_ = manager.config.PrometheusRegisterer.Register(collector)
		defer manager.config.PrometheusRegisterer.Unregister(collector)
	}

	defer manager.waitForGoroutines()

	// This context is passed into all lease store operations.
	// Doing this ensures that no such operations can block worker shutdown.
	// Killing the tomb cancels the context.
	ctx := manager.tomb.Context(context.Background())

	leases, err := manager.config.Store.Leases(ctx)
	if err != nil {
		return errors.Trace(err)
	}
	manager.computeNextTimeout(leases)

	blocks := make(blocks)
	for {
		if err := manager.choose(ctx, blocks); err != nil {
			if errors.Is(err, tomb.ErrDying) {
				err = manager.tomb.Err()
			}
			manager.config.Logger.Tracef("[%s] exiting main loop with error: %v", manager.logContext, err)
			return errors.Trace(err)
		}
	}
}

func (manager *Manager) lookupLease(ctx context.Context, leaseKey lease.Key) (lease.Info, bool, error) {
	leases, err := manager.config.Store.Leases(ctx, leaseKey)
	if err != nil {
		return lease.Info{}, false, errors.Trace(err)
	}

	l, exists := leases[leaseKey]
	return l, exists, nil
}

// choose breaks the select out of the loop to make the blocking logic clearer.
func (manager *Manager) choose(ctx context.Context, blocks blocks) error {
	select {
	case <-manager.tomb.Dying():
		return tomb.ErrDying

	case check := <-manager.checks:
		return manager.handleCheck(ctx, check)

	case now := <-manager.timer.Chan():
		if err := manager.tick(ctx, now, blocks); err != nil {
			return errors.Trace(err)
		}

	case <-manager.expireDone:
		if err := manager.checkBlocks(ctx, blocks); err != nil {
			return errors.Trace(err)
		}

	case claim := <-manager.claims:
		manager.startingClaim()
		go manager.retryingClaim(ctx, claim)

	case revoke := <-manager.revokes:
		manager.startingRevoke()
		go manager.retryingRevoke(ctx, revoke)

	case pin := <-manager.pins:
		manager.handlePin(ctx, pin)

	case unpin := <-manager.unpins:
		manager.handleUnpin(ctx, unpin)

	case block := <-manager.blocks:
		manager.config.Logger.Tracef("[%s] adding block for: %s", manager.logContext, block.leaseKey.Lease)
		blocks.add(block)
	}
	return nil
}

func (manager *Manager) bind(namespace, modelUUID string) (broker, error) {
	secretary, err := manager.config.Secretary(namespace)
	if err != nil {
		return nil, errors.Trace(err)
	}
	return &boundManager{
		manager:   manager,
		secretary: secretary,
		namespace: namespace,
		modelUUID: modelUUID,
	}, nil
}

// Checker returns a lease.Checker for the specified namespace and model.
func (manager *Manager) Checker(namespace, modelUUID string) (lease.Checker, error) {
	return manager.bind(namespace, modelUUID)
}

// Claimer returns a lease.Claimer for the specified namespace and model.
func (manager *Manager) Claimer(namespace, modelUUID string) (lease.Claimer, error) {
	return manager.bind(namespace, modelUUID)
}

// Revoker returns a lease.Revoker for the specified namespace and model.
func (manager *Manager) Revoker(namespace, modelUUID string) (lease.Revoker, error) {
	return manager.bind(namespace, modelUUID)
}

// Pinner returns a lease.Pinner for the specified namespace and model.
func (manager *Manager) Pinner(namespace, modelUUID string) (lease.Pinner, error) {
	return manager.bind(namespace, modelUUID)
}

// Reader returns a lease.Reader for the specified namespace and model.
func (manager *Manager) Reader(namespace, modelUUID string) (lease.Reader, error) {
	return manager.bind(namespace, modelUUID)
}
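
// A hedged sketch of how a caller might use one of the bound facades; the
// namespace, model UUID and holder names below are illustrative only.
//
//	claimer, err := manager.Claimer("application-leadership", modelUUID)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	// Claim (or extend) the lease for a minute on behalf of "mysql/0".
//	if err := claimer.Claim("mysql", "mysql/0", time.Minute); err != nil {
//		if errors.Is(err, lease.ErrClaimDenied) {
//			// Another entity holds the lease.
//		}
//	}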

// retryingClaim handles timeouts when claiming, and responds to the
// claiming party when it eventually succeeds or fails, or if it times
// out after a number of retries.
func (manager *Manager) retryingClaim(ctx context.Context, claim claim) {
	defer manager.finishedClaim()
	var (
		err     error
		success bool
	)

	for a := manager.startRetry(); a.Next(); {
		var act action
		act, success, err = manager.handleClaim(ctx, claim)
		if isFatalClaimRetryError(act, err, a.Count()) {
			break
		}

		if a.More() {
			switch {
			case lease.IsInvalid(err):
				manager.config.Logger.Tracef("[%s] request by %s for lease %s %v, retrying...",
					manager.logContext, claim.holderName, claim.leaseKey.Lease, err)

			default:
				manager.config.Logger.Tracef("[%s] timed out handling claim by %s for lease %s, retrying...",
					manager.logContext, claim.holderName, claim.leaseKey.Lease)
			}
		}
	}

	if err == nil {
		if !success {
			claim.respond(lease.ErrClaimDenied)
			return
		}
		claim.respond(nil)
	} else {
		switch {
		case lease.IsTimeout(err), txn.IsErrRetryable(err):
			manager.config.Logger.Warningf("[%s] timed out retrying while handling claim %q for %q",
				manager.logContext, claim.leaseKey, claim.holderName)
			claim.respond(lease.ErrTimeout)

		case lease.IsInvalid(err):
			// We want to see this, but it isn't something a user
			// can do anything about.
			manager.config.Logger.Infof("[%s] got %v after %d retries, denying claim %q for %q",
				manager.logContext, err, maxRetries, claim.leaseKey, claim.holderName)
			claim.respond(lease.ErrClaimDenied)

		case lease.IsHeld(err):
			// This can happen in HA if the original check for an extant lease
			// (against the local node) returned nothing, but the leader FSM
			// has this lease being held by another entity.
			manager.config.Logger.Tracef(
				"[%s] %s asked for lease %s, held by another entity",
				manager.logContext, claim.holderName, claim.leaseKey.Lease)
			claim.respond(lease.ErrClaimDenied)

		default:
			// Stop the main loop because we got an abnormal error.
			manager.tomb.Kill(errors.Trace(err))
		}
	}
}

type action string

const (
	claimAction  action = "claim"
	extendAction action = "extend"
)

func (a action) String() string {
	switch a {
	case claimAction:
		return "claiming"
	case extendAction:
		return "extending"
	}
	return "unknown"
}

// handleClaim processes the supplied claim. It will only return
// unrecoverable errors or timeouts; mere failure to claim just
// indicates a bad request, and is returned as (false, nil).
func (manager *Manager) handleClaim(ctx context.Context, claim claim) (action, bool, error) {
	logger := manager.config.Logger
	var act action

	select {
	case <-manager.tomb.Dying():
		return "unknown", false, tomb.ErrDying
	default:
		info, found, err := manager.lookupLease(ctx, claim.leaseKey)
		if err != nil {
			return "unknown", false, errors.Trace(err)
		}

		request := lease.Request{Holder: claim.holderName, Duration: claim.duration}
		store := manager.config.Store

		switch {
		case !found:
			logger.Tracef("[%s] %s asked for lease %s (%s), no lease found, claiming for %s",
				manager.logContext, claim.holderName, claim.leaseKey.Lease, claim.leaseKey.Namespace, claim.duration)
			act = claimAction
			err = store.ClaimLease(ctx, claim.leaseKey, request)

		case info.Holder == claim.holderName:
			logger.Tracef("[%s] %s extending lease %s (%s) for %s",
				manager.logContext, claim.holderName, claim.leaseKey.Lease, claim.leaseKey.Namespace, claim.duration)
			act = extendAction
			err = store.ExtendLease(ctx, claim.leaseKey, request)

		default:
			// Note (jam 2017-10-31): We don't check here if the lease has
			// expired for the current holder. Should we?
			remaining := info.Expiry.Sub(manager.config.Clock.Now())
			logger.Tracef("[%s] %s asked for lease %s, held by %s for another %s, rejecting",
				manager.logContext, claim.holderName, claim.leaseKey.Lease, info.Holder, remaining)
			return "unknown", false, nil
		}

		if lease.IsAborted(err) {
			return act, false, tomb.ErrDying
		}
		if err != nil {
			return act, false, errors.Trace(err)
		}
	}

	logger.Tracef("[%s] %s %s lease %s for %s successful",
		manager.logContext, claim.holderName, act.String(), claim.leaseKey.Lease, claim.duration)
	return act, true, nil
}

// retryingRevoke handles timeouts when revoking, and responds to the
// revoking party when it eventually succeeds or fails, or if it times
// out after a number of retries.
func (manager *Manager) retryingRevoke(ctx context.Context, revoke revoke) {
	defer manager.finishedRevoke()
	var err error
	for a := manager.startRetry(); a.Next(); {
		err = manager.handleRevoke(ctx, revoke)
		if isFatalRetryError(err) {
			break
		}

		if a.More() {
			switch {
			case lease.IsInvalid(err):
				manager.config.Logger.Tracef("[%s] request by %s for revoking lease %s %v, retrying...",
					manager.logContext, revoke.holderName, revoke.leaseKey.Lease, err)

			default:
				manager.config.Logger.Tracef("[%s] timed out handling revoke by %s for lease %s, retrying...",
					manager.logContext, revoke.holderName, revoke.leaseKey.Lease)
			}
		}
	}

	if err == nil {
		revoke.respond(nil)
		// If we send back an error, then the main loop won't listen for expireDone.
		select {
		case <-manager.tomb.Dying():
			return
		case manager.expireDone <- struct{}{}:
		}
	} else {
		switch {
		case lease.IsTimeout(err), txn.IsErrRetryable(err):
			manager.config.Logger.Warningf("[%s] timed out retrying while handling revoke %q for %q",
				manager.logContext, revoke.leaseKey, revoke.holderName)
			revoke.respond(lease.ErrTimeout)

		case lease.IsInvalid(err):
			// We want to see this, but it isn't something a user can do anything about.
			manager.config.Logger.Infof("[%s] got %v after %d retries, revoke %q for %q",
				manager.logContext, err, maxRetries, revoke.leaseKey, revoke.holderName)
			revoke.respond(err)

		case lease.IsNotHeld(err):
			// We want to see this, but it isn't something a user can do anything about.
			manager.config.Logger.Infof("[%s] got %v after %d retries, revoke %q for %q",
				manager.logContext, err, maxRetries, revoke.leaseKey, revoke.holderName)
			revoke.respond(err)

		default:
			// Stop the main loop because we got an abnormal error.
			manager.tomb.Kill(errors.Trace(err))
		}
	}
}

// handleRevoke processes the supplied revocation. It will only return
// unrecoverable errors or timeouts.
func (manager *Manager) handleRevoke(ctx context.Context, revoke revoke) error {
	logger := manager.config.Logger

	select {
	case <-manager.tomb.Dying():
		return tomb.ErrDying
	default:
		info, found, err := manager.lookupLease(ctx, revoke.leaseKey)
		if err != nil {
			return errors.Trace(err)
		}

		switch {
		case !found:
			logger.Tracef("[%s] %s asked to revoke lease %s, no lease found",
				manager.logContext, revoke.holderName, revoke.leaseKey.Lease)
			return nil

		case info.Holder == revoke.holderName:
			logger.Tracef("[%s] %s revoking lease %s", manager.logContext, revoke.holderName, revoke.leaseKey.Lease)
			err = manager.config.Store.RevokeLease(ctx, revoke.leaseKey, revoke.holderName)

		default:
			logger.Tracef("[%s] %s revoking lease %s, held by %s, rejecting",
				manager.logContext, revoke.holderName, revoke.leaseKey.Lease, info.Holder)
			return lease.ErrNotHeld
		}

		if lease.IsAborted(err) {
			return tomb.ErrDying
		}
		if err != nil {
			return errors.Trace(err)
		}
	}

	logger.Tracef("[%s] %s revoked lease %s successfully", manager.logContext, revoke.holderName, revoke.leaseKey.Lease)
	return nil
}

// handleCheck processes and responds to the supplied check. It will only return
// unrecoverable errors; mere untruth of the assertion just indicates a bad
// request, and is communicated back to the check's originator.
func (manager *Manager) handleCheck(ctx context.Context, check check) error {
	key := check.leaseKey

	manager.config.Logger.Tracef("[%s] handling Check for lease %s on behalf of %s",
		manager.logContext, key.Lease, check.holderName)

	info, found, err := manager.lookupLease(ctx, key)
	if err != nil {
		return errors.Trace(err)
	}

	var response error
	if !found || info.Holder != check.holderName {
		if found {
			manager.config.Logger.Tracef("[%s] handling Check for lease %s on behalf of %s, found held by %s",
				manager.logContext, key.Lease, check.holderName, info.Holder)
		} else {
			// Someone thought they were the lease-holder, otherwise they
			// wouldn't be confirming via the check. However, the lease has
			// expired, and they are out of sync. Schedule a block check.
			manager.setNextTimeout(manager.config.Clock.Now().Add(time.Second))

			manager.config.Logger.Tracef("[%s] handling Check for lease %s on behalf of %s, not found",
				manager.logContext, key.Lease, check.holderName)
		}

		response = lease.ErrNotHeld
	}
	check.respond(errors.Trace(response))
	return nil
}

// tick triggers when we think a lease might be expiring: we check whether
// there are leases to expire, unblock anything that is no longer blocked,
// and then compute the next time we should wake up.
func (manager *Manager) tick(ctx context.Context, now time.Time, blocks blocks) error {
	manager.config.Logger.Tracef("[%s] tick at %v, running expiry checks\n", manager.logContext, now)
	// Check for blocks that need to be notified.
	return errors.Trace(manager.checkBlocks(ctx, blocks))
}

func (manager *Manager) checkBlocks(ctx context.Context, blocks blocks) error {
	manager.config.Logger.Tracef("[%s] evaluating %d blocks", manager.logContext, len(blocks))

	leases, err := manager.config.Store.Leases(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	for leaseName := range blocks {
		if _, found := leases[leaseName]; !found {
			manager.config.Logger.Tracef("[%s] unblocking: %s", manager.logContext, leaseName)
			blocks.unblock(leaseName)
		}
	}
	manager.computeNextTimeout(leases)
	return nil
}

// computeNextTimeout iterates over the leases and works out the next time we
// want to wake up to expire leases and handle any unblocks that follow.
// It is the earliest lease expiry due in the future, capped at MaxSleep from now.
func (manager *Manager) computeNextTimeout(leases map[lease.Key]lease.Info) {
	now := manager.config.Clock.Now()
	nextTick := now.Add(manager.config.MaxSleep)
	for _, info := range leases {
		if info.Expiry.After(nextTick) {
			continue
		}
		nextTick = info.Expiry
	}

	// If we had leases set to expire in the past, then we assume that our FSM
	// is behind the leader and will soon indicate their expiration.
	// Check the blocks again soon.
	if !nextTick.After(now) {
		nextTick = now
	}

	// The lease clock ticks *at least* a second from now. Expirations only
	// occur when the global clock updater ticks the clock, so this avoids
	// checking too frequently when there is potentially no work to do.
	// The blanket addition of a second is no big deal.
	nextTick = nextTick.Add(time.Second)

	nextDuration := nextTick.Sub(now).Round(time.Millisecond)
	manager.config.Logger.Tracef("[%s] next expire in %v %v", manager.logContext, nextDuration, nextTick)
	manager.setNextTimeout(nextTick)
}
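
// To make the computation above concrete: with MaxSleep of one minute and the
// current time t, leases expiring at t+5s, t+30s and t+2m give
//
//	nextTick = t+5s, then t+6s after the blanket one-second addition
//
// whereas if every lease expires beyond t+1m, nextTick is capped at t+1m
// (again plus the second).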

func (manager *Manager) setNextTimeout(t time.Time) {
	now := manager.config.Clock.Now()

	// Ensure we never walk the next check back without having performed a
	// scheduled check *unless* we think our last check was in the past.
	if !manager.nextTimeout.Before(now) && !t.Before(manager.nextTimeout) {
		manager.config.Logger.Tracef("[%s] not rescheduling check from %v to %v based on current time %v",
			manager.logContext, manager.nextTimeout, t, now)
		return
	}
	manager.nextTimeout = t

	d := t.Sub(now)
	if manager.timer == nil {
		manager.timer = manager.config.Clock.NewTimer(d)
	} else {
		// See the docs on Timer.Reset(): it isn't safe to call Reset on a
		// timer that hasn't been stopped, and after stopping you may still
		// need to drain the channel. It isn't safe to drain unconditionally
		// in case another goroutine has already received from it, so make a
		// non-blocking attempt.
		if !manager.timer.Stop() {
			select {
			case <-manager.timer.Chan():
			default:
			}
		}
		manager.timer.Reset(d)
	}
}
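
// The stop/drain/reset dance above is the standard pattern for reusing a
// timer. A minimal standalone sketch with a standard library time.Timer,
// assuming no other goroutine consumes the channel, looks like:
//
//	if !t.Stop() {
//		select {
//		case <-t.C:
//		default:
//		}
//	}
//	t.Reset(d)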

func (manager *Manager) startRetry() *retry.Attempt {
	return retry.StartWithCancel(
		retry.LimitCount(maxRetries, retry.Exponential{
			Initial: initialRetryDelay,
			Factor:  retryBackoffFactor,
			Jitter:  true,
		}),
		manager.config.Clock,
		manager.tomb.Dying(),
	)
}

func isFatalRetryError(err error) bool {
	switch {
	case txn.IsErrRetryable(err):
		return false
	case lease.IsTimeout(err):
		return false
	case lease.IsInvalid(err):
		return false
	}
	return true
}

func isFatalClaimRetryError(act action, err error, count int) bool {
	switch {
	case txn.IsErrRetryable(err):
		return false
	case lease.IsTimeout(err):
		return false
	case lease.IsInvalid(err):
		return false
	}
	return true
}

func (manager *Manager) handlePin(ctx context.Context, p pin) {
	p.respond(errors.Trace(manager.config.Store.PinLease(ctx, p.leaseKey, p.entity)))
}

func (manager *Manager) handleUnpin(ctx context.Context, p pin) {
	p.respond(errors.Trace(manager.config.Store.UnpinLease(ctx, p.leaseKey, p.entity)))
}

// pinned returns, for the input namespace/model, the names of pinned leases
// and the entities requiring their pinned behaviour.
func (manager *Manager) pinned(ctx context.Context, namespace, modelUUID string) (map[string][]string, error) {
	pinned, err := manager.config.Store.Pinned(ctx)
	if err != nil {
		return nil, errors.Trace(err)
	}

	result := make(map[string][]string)
	for key, entities := range pinned {
		if key.Namespace == namespace && key.ModelUUID == modelUUID {
			result[key.Lease] = entities
		}
	}
	return result, nil
}

func (manager *Manager) leases(ctx context.Context, namespace, modelUUID string) (map[string]string, error) {
	group, err := manager.config.Store.LeaseGroup(ctx, namespace, modelUUID)
	if err != nil {
		return nil, errors.Trace(err)
	}

	leases := make(map[string]string)
	for key, info := range group {
		leases[key.Lease] = info.Holder
	}
	return leases, nil
}

func (manager *Manager) startingClaim() {
	atomic.AddInt64(&manager.outstandingClaims, 1)
	manager.wg.Add(1)
}

func (manager *Manager) finishedClaim() {
	manager.wg.Done()
	atomic.AddInt64(&manager.outstandingClaims, -1)
}

func (manager *Manager) startingRevoke() {
	atomic.AddInt64(&manager.outstandingRevokes, 1)
	manager.wg.Add(1)
}

func (manager *Manager) finishedRevoke() {
	manager.wg.Done()
	atomic.AddInt64(&manager.outstandingRevokes, -1)
}

// Report is part of dependency.Reporter.
func (manager *Manager) Report() map[string]interface{} {
	out := make(map[string]interface{})
	out["entity-uuid"] = manager.config.EntityUUID
	out["outstanding-claims"] = atomic.LoadInt64(&manager.outstandingClaims)
	out["outstanding-revokes"] = atomic.LoadInt64(&manager.outstandingRevokes)
	return out
}

func (manager *Manager) waitForGoroutines() {
	// Wait for the waitgroup to finish, but only up to a point.
	groupDone := make(chan struct{})
	go func() {
		manager.wg.Wait()
		close(groupDone)
	}()

	select {
	case <-groupDone:
		return
	case <-manager.config.Clock.After(maxShutdownWait):
	}
	msg := "timeout waiting for lease manager shutdown"
	dumpFile, err := manager.dumpDebug()
	logger := manager.config.Logger
	if err == nil {
		logger.Warningf("%v\ndebug info written to %v", msg, dumpFile)
	} else {
		logger.Warningf("%v\nerror writing debug info: %v", msg, err)
	}
}

func (manager *Manager) dumpDebug() (string, error) {
	dumpFile, err := os.OpenFile(filepath.Join(manager.config.LogDir, "lease-manager-debug.log"), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		return "", errors.Trace(err)
	}
	defer func() { _ = dumpFile.Close() }()

	claims := atomic.LoadInt64(&manager.outstandingClaims)
	revokes := atomic.LoadInt64(&manager.outstandingRevokes)
	template := `
lease manager state dump %v
entity-uuid: %v
outstanding-claims: %v
outstanding-revokes: %v

`[1:]
	message := fmt.Sprintf(template,
		time.Now().Format(time.RFC3339),
		manager.config.EntityUUID,
		claims,
		revokes,
	)
	if _, err = io.WriteString(dumpFile, message); err != nil {
		return "", errors.Annotate(err, "writing state to debug log file")
	}
	// Including the goroutines because the httpserver won't dump them
	// anymore if this worker stops cleanly.
	return dumpFile.Name(), pprof.Lookup("goroutine").WriteTo(dumpFile, 1)
}