github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/catalog/lease/lease.go (about)

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  // Package lease provides functionality to create and manage sql schema leases.
    12  package lease
    13  
    14  import (
    15  	"bytes"
    16  	"context"
    17  	"fmt"
    18  	"math/rand"
    19  	"sort"
    20  	"strings"
    21  	"sync"
    22  	"sync/atomic"
    23  	"time"
    24  
    25  	"github.com/cockroachdb/cockroach/pkg/base"
    26  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    27  	"github.com/cockroachdb/cockroach/pkg/gossip"
    28  	"github.com/cockroachdb/cockroach/pkg/keys"
    29  	"github.com/cockroachdb/cockroach/pkg/kv"
    30  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    31  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    32  	"github.com/cockroachdb/cockroach/pkg/security"
    33  	"github.com/cockroachdb/cockroach/pkg/settings"
    34  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkv"
    36  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    37  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    38  	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
    39  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    40  	"github.com/cockroachdb/cockroach/pkg/util/log"
    41  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    42  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    43  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    44  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    45  	"github.com/cockroachdb/cockroach/pkg/util/syncutil/singleflight"
    46  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    47  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    48  	"github.com/cockroachdb/errors"
    49  	"github.com/cockroachdb/logtags"
    50  )
    51  
// errRenewLease is a sentinel error whose message asks for a lease renewal.
// NOTE(review): the sites that return it are outside this excerpt — presumably
// it tells a higher layer to renew the lease and retry; confirm against callers.
var errRenewLease = errors.New("renew lease on id")
// errReadOlderTableVersion is a sentinel error whose message refers to reading
// an older table version from the store.
// NOTE(review): presumably it tells the caller to fetch a historical descriptor
// version instead of using a leased one — confirm against callers.
var errReadOlderTableVersion = errors.New("read older table version from store")
    54  
// storedTableLease is the in-memory form of a lease row stored in
// system.lease. Its fields supply the "descID", version and expiration
// columns of that row; the "nodeID" column is provided separately by Storage
// when the row is inserted or deleted.
type storedTableLease struct {
	// id is the ID of the leased table descriptor.
	id         sqlbase.ID
	// version is the leased descriptor version.
	version    int
	// expiration is the lease expiration in the stored (tree.DTimestamp)
	// representation; see storedLeaseExpiration.
	expiration tree.DTimestamp
}
    61  
// tableVersionState holds the state for a table version. This includes
// the lease information for a table version.
// TODO(vivek): A node only needs to manage lease information on what it
// thinks is the latest version for a table descriptor.
type tableVersionState struct {
	// This descriptor is immutable and can be shared by many goroutines.
	// Care must be taken to not modify it.
	sqlbase.ImmutableTableDescriptor

	// The expiration time for the table version. A transaction with
	// timestamp T can use this table descriptor version iff
	// TableDescriptor.ModificationTime <= T < expiration
	//
	// The expiration time is either the expiration time of the lease
	// when a lease is associated with the table version, or the
	// ModificationTime of the next version when the table version
	// isn't associated with a lease.
	expiration hlc.Timestamp

	// mu guards the mutable fields below.
	mu struct {
		syncutil.Mutex

		// refcount is a counter bumped by incRefcount/incRefcountLocked.
		// NOTE(review): presumably the number of outstanding users of this
		// version; the decrement path is outside this excerpt.
		refcount int
		// Set if the node has a lease on this descriptor version.
		// Leases can only be held for the two latest versions of
		// a table descriptor. The latest version known to a node
		// (can be different than the current latest version in the store)
		// is always associated with a lease. The previous version known to
		// a node might not necessarily be associated with a lease.
		lease *storedTableLease
	}
}
    94  
    95  func (s *tableVersionState) String() string {
    96  	s.mu.Lock()
    97  	defer s.mu.Unlock()
    98  	return s.stringLocked()
    99  }
   100  
   101  // stringLocked reads mu.refcount and thus needs to have mu held.
   102  func (s *tableVersionState) stringLocked() string {
   103  	return fmt.Sprintf("%d(%q) ver=%d:%s, refcount=%d", s.ID, s.Name, s.Version, s.expiration, s.mu.refcount)
   104  }
   105  
   106  // hasExpired checks if the table is too old to be used (by a txn operating)
   107  // at the given timestamp
   108  func (s *tableVersionState) hasExpired(timestamp hlc.Timestamp) bool {
   109  	return s.expiration.LessEq(timestamp)
   110  }
   111  
   112  // hasValidExpiration checks that this table have a larger expiration than
   113  // the existing one it is replacing. This can be used to check the
   114  // monotonicity of the expiration times on a table at a particular version.
   115  // The version is not explicitly checked here.
   116  func (s *tableVersionState) hasValidExpiration(existing *tableVersionState) bool {
   117  	return existing.expiration.Less(s.expiration)
   118  }
   119  
   120  func (s *tableVersionState) incRefcount() {
   121  	s.mu.Lock()
   122  	s.incRefcountLocked()
   123  	s.mu.Unlock()
   124  }
   125  
   126  func (s *tableVersionState) incRefcountLocked() {
   127  	s.mu.refcount++
   128  	if log.V(2) {
   129  		log.VEventf(context.TODO(), 2, "tableVersionState.incRef: %s", s.stringLocked())
   130  	}
   131  }
   132  
   133  // The lease expiration stored in the database is of a different type.
   134  // We've decided that it's too much work to change the type to
   135  // hlc.Timestamp, so we're using this method to give us the stored
   136  // type: tree.DTimestamp.
   137  func storedLeaseExpiration(expiration hlc.Timestamp) tree.DTimestamp {
   138  	return tree.DTimestamp{Time: timeutil.Unix(0, expiration.WallTime).Round(time.Microsecond)}
   139  }
   140  
// Storage implements the operations for acquiring and releasing leases and
// publishing a new version of a descriptor. Exported only for testing.
type Storage struct {
	// nodeIDContainer supplies the SQL instance ID written to the "nodeID"
	// column of system.lease rows.
	nodeIDContainer  *base.SQLIDContainer
	// db is used to run the KV transactions that read and write descriptors.
	db               *kv.DB
	// clock provides the current time when counting unexpired leases.
	clock            *hlc.Clock
	// internalExecutor runs the SQL statements against system.lease.
	internalExecutor sqlutil.InternalExecutor
	// settings is passed through when bumping and writing descriptor versions.
	settings         *cluster.Settings
	// codec is used when reading and writing descriptor keys.
	codec            keys.SQLCodec

	// group is used for all calls made to acquireNodeLease to prevent
	// concurrent lease acquisitions from the store.
	group *singleflight.Group

	// leaseDuration is the mean duration a lease will be acquired for. The
	// actual duration is jittered using leaseJitterFraction. Jittering is done to
	// prevent multiple leases from being renewed simultaneously if they were all
	// acquired simultaneously.
	leaseDuration time.Duration
	// leaseJitterFraction is the factor that we use to randomly jitter the lease
	// duration when acquiring a new lease and the lease renewal timeout. The
	// range of the actual lease duration will be
	// [(1-leaseJitterFraction) * leaseDuration, (1+leaseJitterFraction) * leaseDuration]
	leaseJitterFraction float64
	// leaseRenewalTimeout is the time before a lease expires when
	// acquisition to renew the lease begins.
	leaseRenewalTimeout time.Duration

	// testingKnobs holds optional test hooks, such as LeaseAcquiredEvent and
	// LeaseReleasedEvent, which are invoked when set.
	testingKnobs StorageTestingKnobs
}
   171  
   172  // jitteredLeaseDuration returns a randomly jittered duration from the interval
   173  // [(1-leaseJitterFraction) * leaseDuration, (1+leaseJitterFraction) * leaseDuration].
   174  func (s Storage) jitteredLeaseDuration() time.Duration {
   175  	return time.Duration(float64(s.leaseDuration) * (1 - s.leaseJitterFraction +
   176  		2*s.leaseJitterFraction*rand.Float64()))
   177  }
   178  
// acquire acquires a lease on the most recent version of a table descriptor
// by reading the descriptor and inserting a row into system.lease within a
// single transaction.
// If the lease cannot be obtained because the descriptor is in the process of
// being dropped or offline, the error will be of type inactiveTableError.
// The expiration time set for the lease > minExpiration.
//
// On success it returns a tableVersionState holding the descriptor, the
// chosen expiration and the stored lease; on failure the returned error is
// the one produced by the transaction.
func (s Storage) acquire(
	ctx context.Context, minExpiration hlc.Timestamp, tableID sqlbase.ID,
) (*tableVersionState, error) {
	var table *tableVersionState
	err := s.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		// Run the descriptor read as high-priority, thereby pushing any intents out
		// of its way. We don't want schema changes to prevent lease acquisitions;
		// we'd rather force them to refresh. Also this prevents deadlocks in cases
		// where the name resolution is triggered by the transaction doing the
		// schema change itself.
		if err := txn.SetUserPriority(roachpb.MaxUserPriority); err != nil {
			return err
		}
		// The lease expires a jittered lease duration after the transaction's
		// read timestamp.
		expiration := txn.ReadTimestamp()
		expiration.WallTime += int64(s.jitteredLeaseDuration())
		if expiration.LessEq(minExpiration) {
			// In the rare circumstances where expiration <= minExpiration
			// use an expiration based on the minExpiration to guarantee
			// a monotonically increasing expiration.
			expiration = minExpiration.Add(int64(time.Millisecond), 0)
		}

		tableDesc, err := sqlbase.GetTableDescFromID(ctx, txn, s.codec, tableID)
		if err != nil {
			return err
		}
		// Reject descriptors whose state does not allow leasing (see the
		// function comment regarding dropped/offline tables).
		if err := sqlbase.FilterTableState(tableDesc); err != nil {
			return err
		}
		if err := tableDesc.MaybeFillInDescriptor(ctx, txn, s.codec); err != nil {
			return err
		}
		// Once the descriptor is set it is immutable and care must be taken
		// to not modify it.
		storedLease := &storedTableLease{
			id:         tableDesc.ID,
			version:    int(tableDesc.Version),
			expiration: storedLeaseExpiration(expiration),
		}
		table = &tableVersionState{
			ImmutableTableDescriptor: *sqlbase.NewImmutableTableDescriptor(*tableDesc),
			expiration:               expiration,
		}
		// Note that this event is recorded before the lease row is inserted
		// below.
		log.VEventf(ctx, 2, "Storage acquired lease %+v", storedLease)
		table.mu.lease = storedLease

		// ValidateTable instead of Validate, even though we have a txn available,
		// so we don't block reads waiting for this table version.
		if err := table.ValidateTable(); err != nil {
			return err
		}

		// A zero instance ID is treated as a programming error.
		nodeID := s.nodeIDContainer.SQLInstanceID()
		if nodeID == 0 {
			panic("zero nodeID")
		}

		// We use string interpolation here, instead of passing the arguments to
		// InternalExecutor.Exec() because we don't want to pay for preparing the
		// statement (which would happen if we'd pass arguments). Besides the
		// general cost of preparing, preparing this statement always requires a
		// read from the database for the special descriptor of a system table
		// (#23937).
		insertLease := fmt.Sprintf(
			`INSERT INTO system.public.lease ("descID", version, "nodeID", expiration) VALUES (%d, %d, %d, %s)`,
			storedLease.id, storedLease.version, nodeID, &storedLease.expiration,
		)
		count, err := s.internalExecutor.Exec(ctx, "lease-insert", txn, insertLease)
		if err != nil {
			return err
		}
		// Exactly one lease row must have been written.
		if count != 1 {
			return errors.Errorf("%s: expected 1 result, found %d", insertLease, count)
		}
		return nil
	})
	// Notify tests only about successful acquisitions.
	if err == nil && s.testingKnobs.LeaseAcquiredEvent != nil {
		s.testingKnobs.LeaseAcquiredEvent(table.TableDescriptor, nil)
	}
	return table, err
}
   264  
   265  // Release a previously acquired table descriptor. Never let this method
   266  // read a table descriptor because it can be called while modifying a
   267  // descriptor through a schema change before the schema change has committed
   268  // that can result in a deadlock.
   269  func (s Storage) release(ctx context.Context, stopper *stop.Stopper, lease *storedTableLease) {
   270  	retryOptions := base.DefaultRetryOptions()
   271  	retryOptions.Closer = stopper.ShouldQuiesce()
   272  	firstAttempt := true
   273  	// This transaction is idempotent; the retry was put in place because of
   274  	// NodeUnavailableErrors.
   275  	for r := retry.Start(retryOptions); r.Next(); {
   276  		log.VEventf(ctx, 2, "Storage releasing lease %+v", lease)
   277  		nodeID := s.nodeIDContainer.SQLInstanceID()
   278  		if nodeID == 0 {
   279  			panic("zero nodeID")
   280  		}
   281  		const deleteLease = `DELETE FROM system.public.lease ` +
   282  			`WHERE ("descID", version, "nodeID", expiration) = ($1, $2, $3, $4)`
   283  		count, err := s.internalExecutor.Exec(
   284  			ctx,
   285  			"lease-release",
   286  			nil, /* txn */
   287  			deleteLease,
   288  			lease.id, lease.version, nodeID, &lease.expiration,
   289  		)
   290  		if err != nil {
   291  			log.Warningf(ctx, "error releasing lease %q: %s", lease, err)
   292  			firstAttempt = false
   293  			continue
   294  		}
   295  		// We allow count == 0 after the first attempt.
   296  		if count > 1 || (count == 0 && firstAttempt) {
   297  			log.Warningf(ctx, "unexpected results while deleting lease %+v: "+
   298  				"expected 1 result, found %d", lease, count)
   299  		}
   300  
   301  		if s.testingKnobs.LeaseReleasedEvent != nil {
   302  			s.testingKnobs.LeaseReleasedEvent(
   303  				lease.id, sqlbase.DescriptorVersion(lease.version), err)
   304  		}
   305  		break
   306  	}
   307  }
   308  
   309  // WaitForOneVersion returns once there are no unexpired leases on the
   310  // previous version of the table descriptor. It returns the current version.
   311  // After returning there can only be versions of the descriptor >= to the
   312  // returned version. Lease acquisition (see acquire()) maintains the
   313  // invariant that no new leases for desc.Version-1 will be granted once
   314  // desc.Version exists.
   315  func (s Storage) WaitForOneVersion(
   316  	ctx context.Context, tableID sqlbase.ID, retryOpts retry.Options,
   317  ) (sqlbase.DescriptorVersion, error) {
   318  	var tableDesc *sqlbase.TableDescriptor
   319  	var err error
   320  	for lastCount, r := 0, retry.Start(retryOpts); r.Next(); {
   321  		// Get the current version of the table descriptor non-transactionally.
   322  		//
   323  		// TODO(pmattis): Do an inconsistent read here?
   324  		tableDesc, err = sqlbase.GetTableDescFromID(ctx, s.db, s.codec, tableID)
   325  		if err != nil {
   326  			return 0, err
   327  		}
   328  		// Check to see if there are any leases that still exist on the previous
   329  		// version of the descriptor.
   330  		now := s.clock.Now()
   331  		tables := []IDVersion{NewIDVersionPrev(tableDesc)}
   332  		count, err := CountLeases(ctx, s.internalExecutor, tables, now)
   333  		if err != nil {
   334  			return 0, err
   335  		}
   336  		if count == 0 {
   337  			break
   338  		}
   339  		if count != lastCount {
   340  			lastCount = count
   341  			log.Infof(ctx, "waiting for %d leases to expire: desc=%v", count, tables)
   342  		}
   343  	}
   344  	return tableDesc.Version, nil
   345  }
   346  
// ErrDidntUpdateDescriptor can be returned from the update function passed to
// PublishMultiple to suppress an error being returned and return the original
// values. PublishMultiple treats it like a nil error from the transaction and
// returns a nil error to its caller.
var ErrDidntUpdateDescriptor = errors.New("didn't update the table descriptor")
   351  
// PublishMultiple updates multiple table descriptors, maintaining the invariant
// that there are at most two versions of each descriptor out in the wild at any
// time by first waiting for all nodes to be on the current (pre-update) version
// of the table desc.
//
// The update closure for all tables is called after the wait. The map argument
// is a map of the table descriptors with the IDs given in tableIDs, and the
// closure mutates those descriptors. The txn argument closure is intended to be
// used for updating jobs. Note that it can't be used for anything except
// writing to system tables, since we set the system config trigger to write the
// schema changes.
//
// The closure may be called multiple times if retries occur; make sure it does
// not have side effects.
//
// If logEvent is non-nil it is invoked in the same transaction after the
// descriptor writes have been run.
//
// Returns the updated versions of the descriptors.
//
// NOTE(review): the returned map is built from tableDescs, which is only
// populated after update returns nil. When update returns
// ErrDidntUpdateDescriptor on the first attempt the returned map therefore
// appears to be empty rather than holding the original descriptors — confirm
// this is what callers expect.
//
// TODO (lucy): Providing the txn for the update closure just to update a job
// is not ideal. There must be a better API for this.
func (s Storage) PublishMultiple(
	ctx context.Context,
	tableIDs []sqlbase.ID,
	update func(*kv.Txn, map[sqlbase.ID]*sqlbase.MutableTableDescriptor) error,
	logEvent func(*kv.Txn) error,
) (map[sqlbase.ID]*sqlbase.ImmutableTableDescriptor, error) {
	// Internal sentinel used to restart the outer loop when a descriptor
	// version moved between the wait and the transactional re-read.
	errLeaseVersionChanged := errors.New("lease version changed")
	// Retry while getting errLeaseVersionChanged.
	for r := retry.Start(base.DefaultRetryOptions()); r.Next(); {
		// Wait until there are no unexpired leases on the previous versions
		// of the tables.
		expectedVersions := make(map[sqlbase.ID]sqlbase.DescriptorVersion)
		for _, id := range tableIDs {
			expected, err := s.WaitForOneVersion(ctx, id, base.DefaultRetryOptions())
			if err != nil {
				return nil, err
			}
			expectedVersions[id] = expected
		}

		// tableDescs collects the descriptors written by the transaction below;
		// it lives outside the closure so it can be read after the txn returns.
		tableDescs := make(map[sqlbase.ID]*sqlbase.MutableTableDescriptor)
		// There should be only one version of the descriptor, but it's
		// a race now to update to the next version.
		err := s.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
			versions := make(map[sqlbase.ID]sqlbase.DescriptorVersion)
			descsToUpdate := make(map[sqlbase.ID]*sqlbase.MutableTableDescriptor)
			for _, id := range tableIDs {
				// Re-read the current versions of the table descriptor, this time
				// transactionally.
				var err error
				descsToUpdate[id], err = sqlbase.GetMutableTableDescFromID(ctx, txn, s.codec, id)
				if err != nil {
					return err
				}

				if expectedVersions[id] != descsToUpdate[id].Version {
					// The version changed out from under us. Someone else must be
					// performing a schema change operation.
					if log.V(3) {
						log.Infof(ctx, "publish (version changed): %d != %d", expectedVersions[id], descsToUpdate[id].Version)
					}
					return errLeaseVersionChanged
				}

				// Remember the pre-update version so we can detect an update
				// closure that changes the version itself.
				versions[id] = descsToUpdate[id].Version
			}

			// This is to write the updated descriptors.
			if err := txn.SetSystemConfigTrigger(); err != nil {
				return err
			}

			// Run the update closure.
			if err := update(txn, descsToUpdate); err != nil {
				return err
			}
			for _, id := range tableIDs {
				// The update closure must leave the version untouched; it is
				// bumped below via MaybeIncrementVersion.
				if versions[id] != descsToUpdate[id].Version {
					return errors.Errorf("updated version to: %d, expected: %d",
						descsToUpdate[id].Version, versions[id])
				}

				if err := descsToUpdate[id].MaybeIncrementVersion(ctx, txn, s.settings); err != nil {
					return err
				}
				if err := descsToUpdate[id].ValidateTable(); err != nil {
					return err
				}

				tableDescs[id] = descsToUpdate[id]
			}

			// Stage all descriptor writes in a single batch.
			b := txn.NewBatch()
			for tableID, tableDesc := range tableDescs {
				if err := catalogkv.WriteDescToBatch(ctx, false /* kvTrace */, s.settings, b, s.codec, tableID, tableDesc.TableDesc()); err != nil {
					return err
				}
			}
			if logEvent != nil {
				// If an event log is required for this update, ensure that the
				// descriptor change occurs first in the transaction. This is
				// necessary to ensure that the System configuration change is
				// gossiped. See the documentation for
				// transaction.SetSystemConfigTrigger() for more information.
				if err := txn.Run(ctx, b); err != nil {
					return err
				}
				if err := logEvent(txn); err != nil {
					return err
				}
				return txn.Commit(ctx)
			}
			// More efficient batching can be used if no event log message
			// is required.
			return txn.CommitInBatch(ctx, b)
		})

		switch {
		case err == nil || errors.Is(err, ErrDidntUpdateDescriptor):
			// Success, or the update closure declined to update: return
			// immutable copies of whatever was collected in tableDescs.
			immutTableDescs := make(map[sqlbase.ID]*sqlbase.ImmutableTableDescriptor)
			for id, tableDesc := range tableDescs {
				immutTableDescs[id] = sqlbase.NewImmutableTableDescriptor(tableDesc.TableDescriptor)
			}
			return immutTableDescs, nil
		case errors.Is(err, errLeaseVersionChanged):
			// will loop around to retry
		default:
			return nil, err
		}
	}

	// Only reached if the retry loop above terminates without returning.
	panic("not reached")
}
   484  
   485  // Publish updates a table descriptor. It also maintains the invariant that
   486  // there are at most two versions of the descriptor out in the wild at any time
   487  // by first waiting for all nodes to be on the current (pre-update) version of
   488  // the table desc.
   489  //
   490  // The update closure is called after the wait, and it provides the new version
   491  // of the descriptor to be written. In a multi-step schema operation, this
   492  // update should perform a single step.
   493  //
   494  // The closure may be called multiple times if retries occur; make sure it does
   495  // not have side effects.
   496  //
   497  // Returns the updated version of the descriptor.
   498  // TODO (lucy): Maybe have the closure take a *kv.Txn to match
   499  // PublishMultiple.
   500  func (s Storage) Publish(
   501  	ctx context.Context,
   502  	tableID sqlbase.ID,
   503  	update func(*sqlbase.MutableTableDescriptor) error,
   504  	logEvent func(*kv.Txn) error,
   505  ) (*sqlbase.ImmutableTableDescriptor, error) {
   506  	tableIDs := []sqlbase.ID{tableID}
   507  	updates := func(_ *kv.Txn, descs map[sqlbase.ID]*sqlbase.MutableTableDescriptor) error {
   508  		desc, ok := descs[tableID]
   509  		if !ok {
   510  			return errors.AssertionFailedf("required table with ID %d not provided to update closure", tableID)
   511  		}
   512  		return update(desc)
   513  	}
   514  
   515  	results, err := s.PublishMultiple(ctx, tableIDs, updates, logEvent)
   516  	if err != nil {
   517  		return nil, err
   518  	}
   519  	return results[tableID], nil
   520  }
   521  
// IDVersion represents a descriptor ID, version pair that are
// meant to map to a single immutable descriptor.
type IDVersion struct {
	// Name is only provided for pretty printing.
	Name    string
	// ID is the descriptor ID.
	ID      sqlbase.ID
	// Version is the descriptor version paired with ID.
	Version sqlbase.DescriptorVersion
}
   530  
   531  // NewIDVersionPrev returns an initialized IDVersion with the
   532  // previous version of the descriptor.
   533  func NewIDVersionPrev(desc *sqlbase.TableDescriptor) IDVersion {
   534  	return IDVersion{Name: desc.Name, ID: desc.ID, Version: desc.Version - 1}
   535  }
   536  
   537  // CountLeases returns the number of unexpired leases for a number of tables
   538  // each at a particular version at a particular time.
   539  func CountLeases(
   540  	ctx context.Context, executor sqlutil.InternalExecutor, tables []IDVersion, at hlc.Timestamp,
   541  ) (int, error) {
   542  	var whereClauses []string
   543  	for _, t := range tables {
   544  		whereClauses = append(whereClauses,
   545  			fmt.Sprintf(`("descID" = %d AND version = %d AND expiration > $1)`,
   546  				t.ID, t.Version),
   547  		)
   548  	}
   549  
   550  	stmt := fmt.Sprintf(`SELECT count(1) FROM system.public.lease AS OF SYSTEM TIME %s WHERE `,
   551  		at.AsOfSystemTime()) +
   552  		strings.Join(whereClauses, " OR ")
   553  	values, err := executor.QueryRowEx(
   554  		ctx, "count-leases", nil, /* txn */
   555  		sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   556  		stmt, at.GoTime(),
   557  	)
   558  	if err != nil {
   559  		return 0, err
   560  	}
   561  	count := int(tree.MustBeDInt(values[0]))
   562  	return count, nil
   563  }
   564  
// getForExpiration gets the table descriptor valid for the expiration time
// from the store. We use a timestamp that is just less than the expiration
// time to read a version of the table descriptor. A tableVersionState with the
// expiration time set to expiration, and without a lease, is returned.
//
// This returns an error when Replica.checkTSAboveGCThresholdRLocked()
// returns an error, i.e. when the expiration timestamp is less than the
// storage layer GC threshold. It returns sqlbase.ErrDescriptorNotFound when
// the stored descriptor is not a table.
func (s Storage) getForExpiration(
	ctx context.Context, expiration hlc.Timestamp, id sqlbase.ID,
) (*tableVersionState, error) {
	var table *tableVersionState
	err := s.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		descKey := sqlbase.MakeDescMetadataKey(s.codec, id)
		// Read at the timestamp immediately preceding the expiration.
		prevTimestamp := expiration.Prev()
		txn.SetFixedTimestamp(ctx, prevTimestamp)
		var desc sqlbase.Descriptor
		ts, err := txn.GetProtoTs(ctx, descKey, &desc)
		if err != nil {
			return err
		}
		tableDesc := desc.Table(ts)
		if tableDesc == nil {
			return sqlbase.ErrDescriptorNotFound
		}
		// The version read must have been written strictly before the read
		// timestamp; anything else is treated as an assertion failure.
		if prevTimestamp.LessEq(tableDesc.ModificationTime) {
			return errors.AssertionFailedf("unable to read table= (%d, %s)", id, expiration)
		}
		if err := tableDesc.MaybeFillInDescriptor(ctx, txn, s.codec); err != nil {
			return err
		}
		// Create a tableVersionState with the table and without a lease.
		table = &tableVersionState{
			ImmutableTableDescriptor: *sqlbase.NewImmutableTableDescriptor(*tableDesc),
			expiration:               expiration,
		}
		return nil
	})
	return table, err
}
   605  
// leaseToken is an opaque token representing a lease. It is a distinct type
// from the underlying *tableVersionState so that code handed a leaseToken has
// restricted capabilities and cannot improperly use it as a lease.
type leaseToken *tableVersionState
   610  
// tableSet maintains an ordered set of tableVersionState objects sorted
// by version. It supports addition and removal of elements, finding the
// table for a particular version, or finding the most recent table version.
// The order is maintained by insert and remove and there can only be a
// unique entry for a version. Only the last two versions can be leased,
// with the last one being the latest one which is always leased.
//
// Each entry represents a time span [ModificationTime, expiration)
// and can be used by a transaction iff:
// ModificationTime <= transaction.Timestamp < expiration.
type tableSet struct {
	// data holds the entries in increasing version order.
	data []*tableVersionState
}
   624  
   625  func (l *tableSet) String() string {
   626  	var buf bytes.Buffer
   627  	for i, s := range l.data {
   628  		if i > 0 {
   629  			buf.WriteString(" ")
   630  		}
   631  		buf.WriteString(fmt.Sprintf("%d:%d", s.Version, s.expiration.WallTime))
   632  	}
   633  	return buf.String()
   634  }
   635  
   636  func (l *tableSet) insert(s *tableVersionState) {
   637  	i, match := l.findIndex(s.Version)
   638  	if match {
   639  		panic("unable to insert duplicate lease")
   640  	}
   641  	if i == len(l.data) {
   642  		l.data = append(l.data, s)
   643  		return
   644  	}
   645  	l.data = append(l.data, nil)
   646  	copy(l.data[i+1:], l.data[i:])
   647  	l.data[i] = s
   648  }
   649  
   650  func (l *tableSet) remove(s *tableVersionState) {
   651  	i, match := l.findIndex(s.Version)
   652  	if !match {
   653  		panic(fmt.Sprintf("can't find lease to remove: %s", s))
   654  	}
   655  	l.data = append(l.data[:i], l.data[i+1:]...)
   656  }
   657  
   658  func (l *tableSet) find(version sqlbase.DescriptorVersion) *tableVersionState {
   659  	if i, match := l.findIndex(version); match {
   660  		return l.data[i]
   661  	}
   662  	return nil
   663  }
   664  
   665  func (l *tableSet) findIndex(version sqlbase.DescriptorVersion) (int, bool) {
   666  	i := sort.Search(len(l.data), func(i int) bool {
   667  		s := l.data[i]
   668  		return s.Version >= version
   669  	})
   670  	if i < len(l.data) {
   671  		s := l.data[i]
   672  		if s.Version == version {
   673  			return i, true
   674  		}
   675  	}
   676  	return i, false
   677  }
   678  
   679  func (l *tableSet) findNewest() *tableVersionState {
   680  	if len(l.data) == 0 {
   681  		return nil
   682  	}
   683  	return l.data[len(l.data)-1]
   684  }
   685  
   686  func (l *tableSet) findVersion(version sqlbase.DescriptorVersion) *tableVersionState {
   687  	if len(l.data) == 0 {
   688  		return nil
   689  	}
   690  	// Find the index of the first lease with version > targetVersion.
   691  	i := sort.Search(len(l.data), func(i int) bool {
   692  		return l.data[i].Version > version
   693  	})
   694  	if i == 0 {
   695  		return nil
   696  	}
   697  	// i-1 is the index of the newest lease for the previous version (the version
   698  	// we're looking for).
   699  	s := l.data[i-1]
   700  	if s.Version == version {
   701  		return s
   702  	}
   703  	return nil
   704  }
   705  
// tableState holds the lease-related state kept for a single table ID: the
// set of known descriptor versions plus bookkeeping for renewals and
// in-flight acquisitions.
type tableState struct {
	// id is the ID of the table this state belongs to.
	id      sqlbase.ID
	// stopper is the stop.Stopper associated with this table state.
	// NOTE(review): its use is outside this excerpt — presumably for
	// background work and for releasing leases; confirm.
	stopper *stop.Stopper

	// renewalInProgress is an atomic indicator for when a renewal for a
	// lease has begun. This is atomic to prevent multiple routines from
	// entering renewal initialization.
	renewalInProgress int32

	// mu guards the fields below.
	mu struct {
		syncutil.Mutex

		// table descriptors sorted by increasing version. This set always
		// contains a table descriptor version with a lease as the latest
		// entry. There may be more than one active lease when the system is
		// transitioning from one version of the descriptor to another or
		// when the node preemptively acquires a new lease for a version
		// when the old lease has not yet expired. In the latter case, a new
		// entry is created with the expiration time of the new lease and
		// the older entry is removed.
		active tableSet
		// Indicates that the table has been dropped, or is being dropped.
		// If set, leases are released from the store as soon as their
		// refcount drops to 0, as opposed to waiting until they expire.
		dropped bool

		// acquisitionsInProgress indicates that at least one caller is currently
		// in the process of performing an acquisition. This tracking is critical
		// to ensure that notifications of new versions which arrive before a lease
		// acquisition finishes but indicate that that new lease is expired are not
		// ignored.
		acquisitionsInProgress int
	}
}
   740  
   741  // ensureVersion ensures that the latest version >= minVersion. It will
   742  // check if the latest known version meets the criterion, or attempt to
   743  // acquire a lease at the latest version with the hope that it meets
   744  // the criterion.
   745  func ensureVersion(
   746  	ctx context.Context, tableID sqlbase.ID, minVersion sqlbase.DescriptorVersion, m *Manager,
   747  ) error {
   748  	if s := m.findNewest(tableID); s != nil && minVersion <= s.Version {
   749  		return nil
   750  	}
   751  
   752  	if err := m.AcquireFreshestFromStore(ctx, tableID); err != nil {
   753  		return err
   754  	}
   755  
   756  	if s := m.findNewest(tableID); s != nil && s.Version < minVersion {
   757  		return errors.Errorf("version %d for table %s does not exist yet", minVersion, s.Name)
   758  	}
   759  	return nil
   760  }
   761  
   762  // findForTimestamp finds a table descriptor valid for the timestamp.
   763  // In the most common case the timestamp passed to this method is close
   764  // to the current time and in all likelihood the latest version of a
   765  // table descriptor if valid is returned.
   766  //
   767  // This returns errRenewLease when there is no table descriptor cached
   768  // or the latest descriptor version's ModificationTime satisfies the
   769  // timestamp while it's expiration time doesn't satisfy the timestamp.
   770  // This is an optimistic strategy betting that in all likelihood a
   771  // higher layer renewing the lease on the descriptor and populating
   772  // tableState will satisfy the timestamp on a subsequent call.
   773  //
   774  // In all other circumstances where a descriptor cannot be found for the
   775  // timestamp errOlderReadTableVersion is returned requesting a higher layer
   776  // to populate the tableState with a valid older version of the descriptor
   777  // before calling.
   778  //
   779  // The refcount for the returned tableVersionState is incremented.
   780  // It returns true if the descriptor returned is the known latest version
   781  // of the descriptor.
   782  func (t *tableState) findForTimestamp(
   783  	ctx context.Context, timestamp hlc.Timestamp,
   784  ) (*tableVersionState, bool, error) {
   785  	t.mu.Lock()
   786  	defer t.mu.Unlock()
   787  
   788  	// Acquire a lease if no table descriptor exists in the cache.
   789  	if len(t.mu.active.data) == 0 {
   790  		return nil, false, errRenewLease
   791  	}
   792  
   793  	// Walk back the versions to find one that is valid for the timestamp.
   794  	for i := len(t.mu.active.data) - 1; i >= 0; i-- {
   795  		// Check to see if the ModificationTime is valid.
   796  		if table := t.mu.active.data[i]; table.ModificationTime.LessEq(timestamp) {
   797  			latest := i+1 == len(t.mu.active.data)
   798  			if !table.hasExpired(timestamp) {
   799  				// Existing valid table version.
   800  				table.incRefcount()
   801  				return table, latest, nil
   802  			}
   803  
   804  			if latest {
   805  				// Renew the lease if the lease has expired
   806  				// The latest descriptor always has a lease.
   807  				return nil, false, errRenewLease
   808  			}
   809  			break
   810  		}
   811  	}
   812  
   813  	return nil, false, errReadOlderTableVersion
   814  }
   815  
   816  // Read an older table descriptor version for the particular timestamp
   817  // from the store. We unfortunately need to read more than one table
   818  // version just so that we can set the expiration time on the descriptor
   819  // properly.
   820  //
   821  // TODO(vivek): Future work:
   822  // 1. Read multiple versions of a descriptor through one kv call.
   823  // 2. Translate multiple simultaneous calls to this method into a single call
   824  //    as is done for acquireNodeLease().
   825  // 3. Figure out a sane policy on when these descriptors should be purged.
   826  //    They are currently purged in PurgeOldVersions.
   827  func (m *Manager) readOlderVersionForTimestamp(
   828  	ctx context.Context, tableID sqlbase.ID, timestamp hlc.Timestamp,
   829  ) ([]*tableVersionState, error) {
   830  	expiration, done := func() (hlc.Timestamp, bool) {
   831  		t := m.findTableState(tableID, false /* create */)
   832  		t.mu.Lock()
   833  		defer t.mu.Unlock()
   834  		afterIdx := 0
   835  		// Walk back the versions to find one that is valid for the timestamp.
   836  		for i := len(t.mu.active.data) - 1; i >= 0; i-- {
   837  			// Check to see if the ModificationTime is valid.
   838  			if table := t.mu.active.data[i]; table.ModificationTime.LessEq(timestamp) {
   839  				if timestamp.Less(table.expiration) {
   840  					// Existing valid table version.
   841  					return table.expiration, true
   842  				}
   843  				// We need a version after data[i], but before data[i+1].
   844  				// We could very well use the timestamp to read the table
   845  				// descriptor, but unfortunately we will not be able to assign
   846  				// it a proper expiration time. Therefore, we read table
   847  				// descriptors versions one by one from afterIdx back into the
   848  				// past until we find a valid one.
   849  				afterIdx = i + 1
   850  				break
   851  			}
   852  		}
   853  
   854  		if afterIdx == len(t.mu.active.data) {
   855  			return hlc.Timestamp{}, true
   856  		}
   857  
   858  		// Read table descriptor versions one by one into the past until we
   859  		// find a valid one. Every version is assigned an expiration time that
   860  		// is the ModificationTime of the previous one read.
   861  		return t.mu.active.data[afterIdx].ModificationTime, false
   862  	}()
   863  	if done {
   864  		return nil, nil
   865  	}
   866  
   867  	// Read descriptors from the store.
   868  	var versions []*tableVersionState
   869  	for {
   870  		table, err := m.Storage.getForExpiration(ctx, expiration, tableID)
   871  		if err != nil {
   872  			return nil, err
   873  		}
   874  		versions = append(versions, table)
   875  		if table.ModificationTime.LessEq(timestamp) {
   876  			break
   877  		}
   878  		// Set the expiration time for the next table.
   879  		expiration = table.ModificationTime
   880  	}
   881  
   882  	return versions, nil
   883  }
   884  
   885  // Insert table versions. The versions provided are not in
   886  // any particular order.
   887  func (m *Manager) insertTableVersions(tableID sqlbase.ID, versions []*tableVersionState) {
   888  	t := m.findTableState(tableID, false /* create */)
   889  	t.mu.Lock()
   890  	defer t.mu.Unlock()
   891  	for _, tableVersion := range versions {
   892  		// Since we gave up the lock while reading the versions from
   893  		// the store we have to ensure that no one else inserted the
   894  		// same table version.
   895  		table := t.mu.active.findVersion(tableVersion.Version)
   896  		if table == nil {
   897  			t.mu.active.insert(tableVersion)
   898  		}
   899  	}
   900  }
   901  
   902  // AcquireFreshestFromStore acquires a new lease from the store and
   903  // inserts it into the active set. It guarantees that the lease returned is
   904  // the one acquired after the call is made. Use this if the lease we want to
   905  // get needs to see some descriptor updates that we know happened recently.
   906  func (m *Manager) AcquireFreshestFromStore(ctx context.Context, tableID sqlbase.ID) error {
   907  	// Create tableState if needed.
   908  	_ = m.findTableState(tableID, true /* create */)
   909  	// We need to acquire a lease on a "fresh" descriptor, meaning that joining
   910  	// a potential in-progress lease acquisition is generally not good enough.
   911  	// If we are to join an in-progress acquisition, it needs to be an acquisition
   912  	// initiated after this point.
   913  	// So, we handle two cases:
   914  	// 1. The first DoChan() call tells us that we didn't join an in-progress
   915  	//     acquisition. Great, the lease that's being acquired is good.
   916  	// 2. The first DoChan() call tells us that we did join an in-progress acq.
   917  	//     We have to wait this acquisition out; it's not good for us. But any
   918  	//     future acquisition is good, so the next time around the loop it doesn't
   919  	//     matter if we initiate a request or join an in-progress one.
   920  	// In both cases, we need to check if the lease we want is still valid because
   921  	// lease acquisition is done without holding the tableState lock, so anything
   922  	// can happen in between lease acquisition and us getting control again.
   923  	attemptsMade := 0
   924  	for {
   925  		// Acquire a fresh table lease.
   926  		didAcquire, err := acquireNodeLease(ctx, m, tableID)
   927  		if m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent != nil {
   928  			m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent(AcquireFreshestBlock)
   929  		}
   930  		if err != nil {
   931  			return err
   932  		}
   933  
   934  		if didAcquire {
   935  			// Case 1: we didn't join an in-progress call and the lease is still
   936  			// valid.
   937  			break
   938  		} else if attemptsMade > 1 {
   939  			// Case 2: more than one acquisition has happened and the lease is still
   940  			// valid.
   941  			break
   942  		}
   943  		attemptsMade++
   944  	}
   945  	return nil
   946  }
   947  
// upsertLocked inserts a lease for a particular table version.
// If an existing lease exists for the table version it replaces
// it and returns it.
//
// When no entry exists for table.Version, table is inserted and (nil, nil)
// is returned. When an entry exists, table takes over its refcount, the old
// entry is removed from the active set and its stored lease (which may be nil)
// is returned so that the caller can release it from the store. An error is
// returned if table does not have a valid expiration relative to the existing
// entry.
//
// The caller is expected to hold t.mu (acquireNodeLease locks it before
// calling).
func (t *tableState) upsertLocked(
	ctx context.Context, table *tableVersionState,
) (_ *storedTableLease, _ error) {
	s := t.mu.active.find(table.Version)
	if s == nil {
		// Only log when this is not the very first version cached for the
		// table.
		if t.mu.active.findNewest() != nil {
			log.Infof(ctx, "new lease: %s", table)
		}
		t.mu.active.insert(table)
		return nil, nil
	}

	// The table is replacing an existing one at the same version.
	if !table.hasValidExpiration(s) {
		// This is a violation of an invariant and can actually not
		// happen. We return an error here to aid in further investigations.
		return nil, errors.Errorf("lease expiration monotonicity violation, (%s) vs (%s)", s, table)
	}

	// Both version states are locked, old one first, while the refcount and
	// the stored lease are moved over.
	s.mu.Lock()
	table.mu.Lock()
	// subsume the refcount of the older lease. This is permitted because
	// the new lease has a greater expiration than the older lease and
	// any transaction using the older lease can safely use a deadline set
	// to the older lease's expiration even though the older lease is
	// released! This is because the new lease is valid at the same table
	// version at a greater expiration.
	table.mu.refcount += s.mu.refcount
	s.mu.refcount = 0
	// Detach the stored lease from the old entry; it is handed back to the
	// caller below.
	l := s.mu.lease
	s.mu.lease = nil
	if log.V(2) {
		log.VEventf(ctx, 2, "replaced lease: %s with %s", s.stringLocked(), table.stringLocked())
	}
	table.mu.Unlock()
	s.mu.Unlock()
	t.mu.active.remove(s)
	t.mu.active.insert(table)
	return l, nil
}
   991  
   992  // removeInactiveVersions removes inactive versions in t.mu.active.data with refcount 0.
   993  // t.mu must be locked. It returns table version state that need to be released.
   994  func (t *tableState) removeInactiveVersions() []*storedTableLease {
   995  	var leases []*storedTableLease
   996  	// A copy of t.mu.active.data must be made since t.mu.active.data will be changed
   997  	// within the loop.
   998  	for _, table := range append([]*tableVersionState(nil), t.mu.active.data...) {
   999  		func() {
  1000  			table.mu.Lock()
  1001  			defer table.mu.Unlock()
  1002  			if table.mu.refcount == 0 {
  1003  				t.mu.active.remove(table)
  1004  				if l := table.mu.lease; l != nil {
  1005  					table.mu.lease = nil
  1006  					leases = append(leases, l)
  1007  				}
  1008  			}
  1009  		}()
  1010  	}
  1011  	return leases
  1012  }
  1013  
// acquireNodeLease acquires a lease on table id from the store and records it
// in the table's state. Concurrent calls for the same id are coalesced through
// m.group, keyed on "acquire<id>".
//
// If the lease cannot be obtained because the descriptor is in the process of
// being dropped or offline, the error will be of type inactiveTableError.
// The boolean returned is true if this call was actually responsible for the
// lease acquisition.
//
// If ctx is done before the acquisition completes, ctx.Err() is returned; the
// acquisition itself runs on a separate context (see below).
func acquireNodeLease(ctx context.Context, m *Manager, id sqlbase.ID) (bool, error) {
	var toRelease *storedTableLease
	resultChan, didAcquire := m.group.DoChan(fmt.Sprintf("acquire%d", id), func() (interface{}, error) {
		// Note that we use a new `context` here to avoid a situation where a cancellation
		// of the first context cancels other callers to the `acquireNodeLease()` method,
		// because of its use of `singleflight.Group`. See issue #41780 for how this has
		// happened.
		newCtx, cancel := m.stopper.WithCancelOnQuiesce(logtags.WithTags(context.Background(), logtags.FromContext(ctx)))
		defer cancel()
		if m.isDraining() {
			return nil, errors.New("cannot acquire lease when draining")
		}
		// The expiration of the newest cached version, if any, is passed to
		// the store as the minimum expiration for the new lease.
		newest := m.findNewest(id)
		var minExpiration hlc.Timestamp
		if newest != nil {
			minExpiration = newest.expiration
		}
		table, err := m.Storage.acquire(newCtx, minExpiration, id)
		if err != nil {
			return nil, err
		}
		t := m.findTableState(id, false /* create */)
		t.mu.Lock()
		defer t.mu.Unlock()
		toRelease, err = t.upsertLocked(newCtx, table)
		if err != nil {
			return nil, err
		}
		m.tableNames.insert(table)
		// A lease replaced by the upsert is released from the store. Note
		// that t.mu is still held at this point.
		if toRelease != nil {
			releaseLease(toRelease, m)
		}
		return leaseToken(table), nil
	})
	// Only the error of the shared result is inspected; the value is not used
	// here.
	select {
	case <-ctx.Done():
		return false, ctx.Err()
	case result := <-resultChan:
		if result.Err != nil {
			return false, result.Err
		}
	}
	return didAcquire, nil
}
  1062  
  1063  // release returns a tableVersionState that needs to be released from
  1064  // the store.
  1065  func (t *tableState) release(
  1066  	table *sqlbase.ImmutableTableDescriptor, removeOnceDereferenced bool,
  1067  ) (*storedTableLease, error) {
  1068  	t.mu.Lock()
  1069  	defer t.mu.Unlock()
  1070  
  1071  	s := t.mu.active.find(table.Version)
  1072  	if s == nil {
  1073  		return nil, errors.Errorf("table %d version %d not found", table.ID, table.Version)
  1074  	}
  1075  	// Decrements the refcount and returns true if the lease has to be removed
  1076  	// from the store.
  1077  	decRefcount := func(s *tableVersionState) *storedTableLease {
  1078  		// Figure out if we'd like to remove the lease from the store asap (i.e.
  1079  		// when the refcount drops to 0). If so, we'll need to mark the lease as
  1080  		// invalid.
  1081  		removeOnceDereferenced = removeOnceDereferenced ||
  1082  			// Release from the store if the table has been dropped; no leases
  1083  			// can be acquired any more.
  1084  			t.mu.dropped ||
  1085  			// Release from the store if the lease is not for the latest
  1086  			// version; only leases for the latest version can be acquired.
  1087  			s != t.mu.active.findNewest()
  1088  
  1089  		s.mu.Lock()
  1090  		defer s.mu.Unlock()
  1091  		s.mu.refcount--
  1092  		if log.V(2) {
  1093  			log.VEventf(context.TODO(), 2, "release: %s", s.stringLocked())
  1094  		}
  1095  		if s.mu.refcount < 0 {
  1096  			panic(fmt.Sprintf("negative ref count: %s", s))
  1097  		}
  1098  
  1099  		if s.mu.refcount == 0 && s.mu.lease != nil && removeOnceDereferenced {
  1100  			l := s.mu.lease
  1101  			s.mu.lease = nil
  1102  			return l
  1103  		}
  1104  		return nil
  1105  	}
  1106  	if l := decRefcount(s); l != nil {
  1107  		t.mu.active.remove(s)
  1108  		return l, nil
  1109  	}
  1110  	return nil, nil
  1111  }
  1112  
  1113  // releaseLease from store.
  1114  func releaseLease(lease *storedTableLease, m *Manager) {
  1115  	ctx := context.TODO()
  1116  	if m.isDraining() {
  1117  		// Release synchronously to guarantee release before exiting.
  1118  		m.Storage.release(ctx, m.stopper, lease)
  1119  		return
  1120  	}
  1121  
  1122  	// Release to the store asynchronously, without the tableState lock.
  1123  	if err := m.stopper.RunAsyncTask(
  1124  		ctx, "sql.tableState: releasing descriptor lease",
  1125  		func(ctx context.Context) {
  1126  			m.Storage.release(ctx, m.stopper, lease)
  1127  		}); err != nil {
  1128  		log.Warningf(ctx, "error: %s, not releasing lease: %q", err, lease)
  1129  	}
  1130  }
  1131  
  1132  // purgeOldVersions removes old unused table descriptor versions older than
  1133  // minVersion and releases any associated leases.
  1134  // If takenOffline is set, minVersion is ignored; no lease is acquired and all
  1135  // existing unused versions are removed. The table is further marked dropped,
  1136  // which will cause existing in-use leases to be eagerly released once
  1137  // they're not in use any more.
  1138  // If t has no active leases, nothing is done.
  1139  func purgeOldVersions(
  1140  	ctx context.Context,
  1141  	db *kv.DB,
  1142  	id sqlbase.ID,
  1143  	takenOffline bool,
  1144  	minVersion sqlbase.DescriptorVersion,
  1145  	m *Manager,
  1146  ) error {
  1147  	t := m.findTableState(id, false /*create*/)
  1148  	if t == nil {
  1149  		return nil
  1150  	}
  1151  	t.mu.Lock()
  1152  	empty := len(t.mu.active.data) == 0 && t.mu.acquisitionsInProgress == 0
  1153  	t.mu.Unlock()
  1154  	if empty {
  1155  		// We don't currently have a version on this table, so no need to refresh
  1156  		// anything.
  1157  		return nil
  1158  	}
  1159  
  1160  	removeInactives := func(drop bool) {
  1161  		t.mu.Lock()
  1162  		t.mu.dropped = drop
  1163  		leases := t.removeInactiveVersions()
  1164  		t.mu.Unlock()
  1165  		for _, l := range leases {
  1166  			releaseLease(l, m)
  1167  		}
  1168  	}
  1169  
  1170  	if takenOffline {
  1171  		removeInactives(takenOffline)
  1172  		return nil
  1173  	}
  1174  
  1175  	if err := ensureVersion(ctx, id, minVersion, m); err != nil {
  1176  		return err
  1177  	}
  1178  
  1179  	// Acquire a refcount on the table on the latest version to maintain an
  1180  	// active lease, so that it doesn't get released when removeInactives()
  1181  	// is called below. Release this lease after calling removeInactives().
  1182  	table, _, err := t.findForTimestamp(ctx, m.clock.Now())
  1183  	if isInactive := sqlbase.HasInactiveTableError(err); err == nil || isInactive {
  1184  		removeInactives(isInactive)
  1185  		if table != nil {
  1186  			s, err := t.release(&table.ImmutableTableDescriptor, m.removeOnceDereferenced())
  1187  			if err != nil {
  1188  				return err
  1189  			}
  1190  			if s != nil {
  1191  				releaseLease(s, m)
  1192  			}
  1193  			return nil
  1194  		}
  1195  		return nil
  1196  	}
  1197  	return err
  1198  }
  1199  
  1200  // maybeQueueLeaseRenewal queues a lease renewal if there is not already a lease
  1201  // renewal in progress.
  1202  func (t *tableState) maybeQueueLeaseRenewal(
  1203  	ctx context.Context, m *Manager, tableID sqlbase.ID, tableName string,
  1204  ) error {
  1205  	if !atomic.CompareAndSwapInt32(&t.renewalInProgress, 0, 1) {
  1206  		return nil
  1207  	}
  1208  
  1209  	// Start the renewal. When it finishes, it will reset t.renewalInProgress.
  1210  	return t.stopper.RunAsyncTask(context.Background(),
  1211  		"lease renewal", func(ctx context.Context) {
  1212  			var cleanup func()
  1213  			ctx, cleanup = tracing.EnsureContext(ctx, m.ambientCtx.Tracer, "lease renewal")
  1214  			defer cleanup()
  1215  			t.startLeaseRenewal(ctx, m, tableID, tableName)
  1216  		})
  1217  }
  1218  
  1219  // startLeaseRenewal starts a singleflight.Group to acquire a lease.
  1220  // This function blocks until lease acquisition completes.
  1221  // t.renewalInProgress must be set to 1 before calling.
  1222  func (t *tableState) startLeaseRenewal(
  1223  	ctx context.Context, m *Manager, tableID sqlbase.ID, tableName string,
  1224  ) {
  1225  	log.VEventf(ctx, 1,
  1226  		"background lease renewal beginning for tableID=%d tableName=%q",
  1227  		tableID, tableName)
  1228  	if _, err := acquireNodeLease(ctx, m, tableID); err != nil {
  1229  		log.Errorf(ctx,
  1230  			"background lease renewal for tableID=%d tableName=%q failed: %s",
  1231  			tableID, tableName, err)
  1232  	} else {
  1233  		log.VEventf(ctx, 1,
  1234  			"background lease renewal finished for tableID=%d tableName=%q",
  1235  			tableID, tableName)
  1236  	}
  1237  	atomic.StoreInt32(&t.renewalInProgress, 0)
  1238  }
  1239  
  1240  // markAcquisitionStart increments the acquisitionsInProgress counter.
  1241  func (t *tableState) markAcquisitionStart(ctx context.Context) {
  1242  	t.mu.Lock()
  1243  	defer t.mu.Unlock()
  1244  	t.mu.acquisitionsInProgress++
  1245  }
  1246  
  1247  // markAcquisitionDone decrements the acquisitionsInProgress counter.
  1248  func (t *tableState) markAcquisitionDone(ctx context.Context) {
  1249  	t.mu.Lock()
  1250  	defer t.mu.Unlock()
  1251  	t.mu.acquisitionsInProgress--
  1252  }
  1253  
// AcquireBlockType is the type of blocking result event when
// calling LeaseAcquireResultBlockEvent. It identifies which code path
// invoked the testing callback.
type AcquireBlockType int

const (
	// AcquireBlock denotes the LeaseAcquireResultBlockEvent is
	// coming from tableState.acquire().
	AcquireBlock AcquireBlockType = iota
	// AcquireFreshestBlock denotes the LeaseAcquireResultBlockEvent is
	// from Manager.AcquireFreshestFromStore().
	AcquireFreshestBlock
)
  1266  
// StorageTestingKnobs contains testing knobs for the lease store.
type StorageTestingKnobs struct {
	// Called after a lease is removed from the store, with any operation error.
	// See LeaseRemovalTracker.
	LeaseReleasedEvent func(id sqlbase.ID, version sqlbase.DescriptorVersion, err error)
	// Called after a lease is acquired, with any operation error.
	LeaseAcquiredEvent func(table sqlbase.TableDescriptor, err error)
	// Called around a DoChan call of acquireNodeLease; the argument says
	// which code path is reporting. AcquireFreshestFromStore invokes it with
	// AcquireFreshestBlock once acquireNodeLease has returned.
	LeaseAcquireResultBlockEvent func(leaseBlockType AcquireBlockType)
	// RemoveOnceDereferenced forces leases to be removed
	// as soon as they are dereferenced.
	RemoveOnceDereferenced bool
}

// ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface.
func (*StorageTestingKnobs) ModuleTestingKnobs() {}

// Compile-time check that *StorageTestingKnobs implements
// base.ModuleTestingKnobs.
var _ base.ModuleTestingKnobs = &StorageTestingKnobs{}
  1286  
// ManagerTestingKnobs contains test knobs for the lease Manager.
type ManagerTestingKnobs struct {

	// A callback called after the leases are refreshed as a result of a gossip update.
	TestingTableRefreshedEvent func(descriptor *sqlbase.TableDescriptor)

	// TestingTableUpdateEvent is a callback when an update is received, before
	// the leases are refreshed. If a non-nil error is returned, the update is
	// ignored.
	TestingTableUpdateEvent func(descriptor *sqlbase.TableDescriptor) error

	// To disable the deletion of orphaned leases at server startup.
	DisableDeleteOrphanedLeases bool

	// AlwaysUseRangefeeds ensures that rangefeeds and not gossip are used to
	// detect changes to table descriptors.
	AlwaysUseRangefeeds bool

	// VersionPollIntervalForRangefeeds controls the polling interval for the
	// check whether the requisite version for rangefeed-based notifications has
	// been finalized.
	//
	// TODO(ajwerner): Remove this and replace it with a callback.
	VersionPollIntervalForRangefeeds time.Duration

	// LeaseStoreTestingKnobs holds the knobs passed on to the lease store
	// (NewLeaseManager copies them into Storage.testingKnobs).
	LeaseStoreTestingKnobs StorageTestingKnobs
}

// Compile-time check that *ManagerTestingKnobs implements
// base.ModuleTestingKnobs.
var _ base.ModuleTestingKnobs = &ManagerTestingKnobs{}

// ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface.
func (*ManagerTestingKnobs) ModuleTestingKnobs() {}
  1319  
// tableNameCacheKey is the map key used by tableNameCache: a table is
// identified by its parent database ID, parent schema ID and name.
type tableNameCacheKey struct {
	dbID                sqlbase.ID
	schemaID            sqlbase.ID
	normalizeTabledName string
}
  1325  
// tableNameCache is a cache of table name -> latest table version mappings.
// The Manager updates the cache every time a lease is acquired or released
// from the store. The cache maintains the latest version for each table name.
// All methods are thread-safe.
type tableNameCache struct {
	// mu guards tables.
	mu syncutil.Mutex
	// tables maps a (database ID, schema ID, table name) key to the cached
	// version state for that name.
	tables map[tableNameCacheKey]*tableVersionState
}
  1334  
  1335  // Resolves a (database ID, table name) to the table descriptor's ID.
  1336  // Returns a valid tableVersionState for the table with that name,
  1337  // if the name had been previously cached and the cache has a table
  1338  // version that has not expired. Returns nil otherwise.
  1339  // This method handles normalizing the table name.
  1340  // The table's refcount is incremented before returning, so the caller
  1341  // is responsible for releasing it to the leaseManager.
  1342  func (c *tableNameCache) get(
  1343  	dbID sqlbase.ID, schemaID sqlbase.ID, tableName string, timestamp hlc.Timestamp,
  1344  ) *tableVersionState {
  1345  	c.mu.Lock()
  1346  	table, ok := c.tables[makeTableNameCacheKey(dbID, schemaID, tableName)]
  1347  	c.mu.Unlock()
  1348  	if !ok {
  1349  		return nil
  1350  	}
  1351  	table.mu.Lock()
  1352  	if table.mu.lease == nil {
  1353  		table.mu.Unlock()
  1354  		// This get() raced with a release operation. Remove this cache
  1355  		// entry if needed.
  1356  		c.remove(table)
  1357  		return nil
  1358  	}
  1359  
  1360  	defer table.mu.Unlock()
  1361  
  1362  	if !NameMatchesTable(
  1363  		&table.ImmutableTableDescriptor.TableDescriptor,
  1364  		dbID,
  1365  		schemaID,
  1366  		tableName,
  1367  	) {
  1368  		panic(fmt.Sprintf("Out of sync entry in the name cache. "+
  1369  			"Cache entry: %d.%q -> %d. Lease: %d.%q.",
  1370  			dbID, tableName, table.ID, table.ParentID, table.Name))
  1371  	}
  1372  
  1373  	// Expired table. Don't hand it out.
  1374  	if table.hasExpired(timestamp) {
  1375  		return nil
  1376  	}
  1377  
  1378  	table.incRefcountLocked()
  1379  	return table
  1380  }
  1381  
  1382  func (c *tableNameCache) insert(table *tableVersionState) {
  1383  	c.mu.Lock()
  1384  	defer c.mu.Unlock()
  1385  
  1386  	key := makeTableNameCacheKey(table.ParentID, table.GetParentSchemaID(), table.Name)
  1387  	existing, ok := c.tables[key]
  1388  	if !ok {
  1389  		c.tables[key] = table
  1390  		return
  1391  	}
  1392  	// If we already have a lease in the cache for this name, see if this one is
  1393  	// better (higher version or later expiration).
  1394  	if table.Version > existing.Version ||
  1395  		(table.Version == existing.Version && table.hasValidExpiration(existing)) {
  1396  		// Overwrite the old table. The new one is better. From now on, we want
  1397  		// clients to use the new one.
  1398  		c.tables[key] = table
  1399  	}
  1400  }
  1401  
  1402  func (c *tableNameCache) remove(table *tableVersionState) {
  1403  	c.mu.Lock()
  1404  	defer c.mu.Unlock()
  1405  
  1406  	key := makeTableNameCacheKey(table.ParentID, table.GetParentSchemaID(), table.Name)
  1407  	existing, ok := c.tables[key]
  1408  	if !ok {
  1409  		// Table for lease not found in table name cache. This can happen if we had
  1410  		// a more recent lease on the table in the tableNameCache, then the table
  1411  		// gets dropped, then the more recent lease is remove()d - which clears the
  1412  		// cache.
  1413  		return
  1414  	}
  1415  	// If this was the lease that the cache had for the table name, remove it.
  1416  	// If the cache had some other table, this remove is a no-op.
  1417  	if existing == table {
  1418  		delete(c.tables, key)
  1419  	}
  1420  }
  1421  
  1422  func makeTableNameCacheKey(
  1423  	dbID sqlbase.ID, schemaID sqlbase.ID, tableName string,
  1424  ) tableNameCacheKey {
  1425  	return tableNameCacheKey{dbID, schemaID, tableName}
  1426  }
  1427  
// Manager manages acquiring and releasing per-table leases. It also
// handles resolving table names to descriptor IDs. The leases are managed
// internally with a table descriptor and expiration time exported by the
// API. The table descriptor acquired needs to be released. A transaction
// can use a table descriptor as long as its timestamp is within the
// validity window for the descriptor:
// descriptor.ModificationTime <= txn.Timestamp < expirationTime
//
// Exported only for testing.
//
// The locking order is:
// Manager.mu > tableState.mu > tableNameCache.mu > tableVersionState.mu
type Manager struct {
	// Storage is embedded; it is populated by NewLeaseManager.
	Storage
	mu struct {
		syncutil.Mutex
		// tables maps a table ID to its tableState.
		tables map[sqlbase.ID]*tableState

		// updatesResolvedTimestamp keeps track of a timestamp before which all
		// table updates have already been seen.
		updatesResolvedTimestamp hlc.Timestamp
	}

	// draining is initialized to false by NewLeaseManager.
	draining atomic.Value

	// tableNames is a cache for name -> id mappings. A mapping for the cache
	// should only be used if we currently have an active lease on the respective
	// id; otherwise, the mapping may well be stale.
	// Not protected by mu.
	tableNames   tableNameCache
	testingKnobs ManagerTestingKnobs
	ambientCtx   log.AmbientContext
	// stopper runs the manager's async tasks, such as asynchronous lease
	// releases.
	stopper *stop.Stopper
	// sem is a quota pool of size leaseConcurrencyLimit.
	sem *quotapool.IntPool
}
  1463  
// leaseConcurrencyLimit is the capacity of the quota pool that
// NewLeaseManager creates for Manager.sem.
const leaseConcurrencyLimit = 5
  1465  
  1466  // NewLeaseManager creates a new Manager.
  1467  //
  1468  // internalExecutor can be nil to help bootstrapping, but then it needs to be set via
  1469  // SetInternalExecutor before the Manager is used.
  1470  //
  1471  // stopper is used to run async tasks. Can be nil in tests.
  1472  func NewLeaseManager(
  1473  	ambientCtx log.AmbientContext,
  1474  	nodeIDContainer *base.SQLIDContainer,
  1475  	db *kv.DB,
  1476  	clock *hlc.Clock,
  1477  	internalExecutor sqlutil.InternalExecutor,
  1478  	settings *cluster.Settings,
  1479  	codec keys.SQLCodec,
  1480  	testingKnobs ManagerTestingKnobs,
  1481  	stopper *stop.Stopper,
  1482  	cfg *base.LeaseManagerConfig,
  1483  ) *Manager {
  1484  	lm := &Manager{
  1485  		Storage: Storage{
  1486  			nodeIDContainer:     nodeIDContainer,
  1487  			db:                  db,
  1488  			clock:               clock,
  1489  			internalExecutor:    internalExecutor,
  1490  			settings:            settings,
  1491  			codec:               codec,
  1492  			group:               &singleflight.Group{},
  1493  			leaseDuration:       cfg.TableDescriptorLeaseDuration,
  1494  			leaseJitterFraction: cfg.TableDescriptorLeaseJitterFraction,
  1495  			leaseRenewalTimeout: cfg.TableDescriptorLeaseRenewalTimeout,
  1496  			testingKnobs:        testingKnobs.LeaseStoreTestingKnobs,
  1497  		},
  1498  		testingKnobs: testingKnobs,
  1499  		tableNames: tableNameCache{
  1500  			tables: make(map[tableNameCacheKey]*tableVersionState),
  1501  		},
  1502  		ambientCtx: ambientCtx,
  1503  		stopper:    stopper,
  1504  		sem:        quotapool.NewIntPool("lease manager", leaseConcurrencyLimit),
  1505  	}
  1506  	lm.stopper.AddCloser(lm.sem.Closer("stopper"))
  1507  	lm.mu.tables = make(map[sqlbase.ID]*tableState)
  1508  	lm.mu.updatesResolvedTimestamp = db.Clock().Now()
  1509  
  1510  	lm.draining.Store(false)
  1511  	return lm
  1512  }
  1513  
  1514  // NameMatchesTable returns true if the provided name and IDs match this
  1515  // descriptor.
  1516  func NameMatchesTable(
  1517  	table *sqlbase.TableDescriptor, dbID sqlbase.ID, schemaID sqlbase.ID, tableName string,
  1518  ) bool {
  1519  	return table.ParentID == dbID && table.Name == tableName &&
  1520  		table.GetParentSchemaID() == schemaID
  1521  }
  1522  
  1523  // findNewest returns the newest table version state for the tableID.
  1524  func (m *Manager) findNewest(tableID sqlbase.ID) *tableVersionState {
  1525  	t := m.findTableState(tableID, false /* create */)
  1526  	t.mu.Lock()
  1527  	defer t.mu.Unlock()
  1528  	return t.mu.active.findNewest()
  1529  }
  1530  
  1531  // AcquireByName returns a table version for the specified table valid for
  1532  // the timestamp. It returns the table descriptor and a expiration time.
  1533  // A transaction using this descriptor must ensure that its
  1534  // commit-timestamp < expiration-time. Care must be taken to not modify
  1535  // the returned descriptor. Renewal of a lease may begin in the
  1536  // background. Renewal is done in order to prevent blocking on future
  1537  // acquisitions.
  1538  //
  1539  // Known limitation: AcquireByName() calls Acquire() and therefore suffers
  1540  // from the same limitation as Acquire (See Acquire). AcquireByName() is
  1541  // unable to function correctly on a timestamp less than the timestamp
  1542  // of a transaction with a DROP/TRUNCATE on a table. The limitation in
  1543  // the face of a DROP follows directly from the limitation on Acquire().
  1544  // A TRUNCATE is implemented by changing the name -> id mapping for a table
  1545  // and by dropping the descriptor with the old id. While AcquireByName
  1546  // can use the timestamp and get the correct name->id  mapping at a
  1547  // timestamp, it uses Acquire() to get a descriptor with the corresponding
  1548  // id and fails because the id has been dropped by the TRUNCATE.
  1549  func (m *Manager) AcquireByName(
  1550  	ctx context.Context,
  1551  	timestamp hlc.Timestamp,
  1552  	dbID sqlbase.ID,
  1553  	schemaID sqlbase.ID,
  1554  	tableName string,
  1555  ) (*sqlbase.ImmutableTableDescriptor, hlc.Timestamp, error) {
  1556  	// Check if we have cached an ID for this name.
  1557  	tableVersion := m.tableNames.get(dbID, schemaID, tableName, timestamp)
  1558  	if tableVersion != nil {
  1559  		if tableVersion.ModificationTime.LessEq(timestamp) {
  1560  			// If this lease is nearly expired, ensure a renewal is queued.
  1561  			durationUntilExpiry := time.Duration(tableVersion.expiration.WallTime - timestamp.WallTime)
  1562  			if durationUntilExpiry < m.Storage.leaseRenewalTimeout {
  1563  				if t := m.findTableState(tableVersion.ID, false /* create */); t != nil {
  1564  					if err := t.maybeQueueLeaseRenewal(
  1565  						ctx, m, tableVersion.ID, tableName); err != nil {
  1566  						return nil, hlc.Timestamp{}, err
  1567  					}
  1568  				}
  1569  			}
  1570  			return &tableVersion.ImmutableTableDescriptor, tableVersion.expiration, nil
  1571  		}
  1572  		if err := m.Release(&tableVersion.ImmutableTableDescriptor); err != nil {
  1573  			return nil, hlc.Timestamp{}, err
  1574  		}
  1575  		// Return a valid table descriptor for the timestamp.
  1576  		table, expiration, err := m.Acquire(ctx, timestamp, tableVersion.ID)
  1577  		if err != nil {
  1578  			return nil, hlc.Timestamp{}, err
  1579  		}
  1580  		return table, expiration, nil
  1581  	}
  1582  
  1583  	// We failed to find something in the cache, or what we found is not
  1584  	// guaranteed to be valid by the time we use it because we don't have a
  1585  	// lease with at least a bit of lifetime left in it. So, we do it the hard
  1586  	// way: look in the database to resolve the name, then acquire a new table.
  1587  	var err error
  1588  	tableID, err := m.resolveName(ctx, timestamp, dbID, schemaID, tableName)
  1589  	if err != nil {
  1590  		return nil, hlc.Timestamp{}, err
  1591  	}
  1592  	table, expiration, err := m.Acquire(ctx, timestamp, tableID)
  1593  	if err != nil {
  1594  		return nil, hlc.Timestamp{}, err
  1595  	}
  1596  	if !NameMatchesTable(&table.TableDescriptor, dbID, schemaID, tableName) {
  1597  		// We resolved name `tableName`, but the lease has a different name in it.
  1598  		// That can mean two things. Assume the table is being renamed from A to B.
  1599  		// a) `tableName` is A. The transaction doing the RENAME committed (so the
  1600  		// descriptor has been updated to B), but its schema changer has not
  1601  		// finished yet. B is the new name of the table, queries should use that. If
  1602  		// we already had a lease with name A, we would've allowed to use it (but we
  1603  		// don't, otherwise the cache lookup above would've given it to us).  Since
  1604  		// we don't, let's not allow A to be used, given that the lease now has name
  1605  		// B in it. It'd be sketchy to allow A to be used with an inconsistent name
  1606  		// in the table.
  1607  		//
  1608  		// b) `tableName` is B. Like in a), the transaction doing the RENAME
  1609  		// committed (so the descriptor has been updated to B), but its schema
  1610  		// change has not finished yet. We still had a valid lease with name A in
  1611  		// it. What to do, what to do? We could allow name B to be used, but who
  1612  		// knows what consequences that would have, since its not consistent with
  1613  		// the table. We could say "table B not found", but that means that, until
  1614  		// the next gossip update, this node would not service queries for this
  1615  		// table under the name B. That's no bueno, as B should be available to be
  1616  		// used immediately after the RENAME transaction has committed.
  1617  		// The problem is that we have a lease that we know is stale (the descriptor
  1618  		// in the DB doesn't necessarily have a new version yet, but it definitely
  1619  		// has a new name). So, lets force getting a fresh table.
  1620  		// This case (modulo the "committed" part) also applies when the txn doing a
  1621  		// RENAME had a lease on the old name, and then tries to use the new name
  1622  		// after the RENAME statement.
  1623  		//
  1624  		// How do we disambiguate between the a) and b)? We get a fresh lease on
  1625  		// the descriptor, as required by b), and then we'll know if we're trying to
  1626  		// resolve the current or the old name.
  1627  		//
  1628  		// TODO(vivek): check if the entire above comment is indeed true. Review the
  1629  		// use of NameMatchesTable() throughout this function.
  1630  		if err := m.Release(table); err != nil {
  1631  			log.Warningf(ctx, "error releasing lease: %s", err)
  1632  		}
  1633  		if err := m.AcquireFreshestFromStore(ctx, tableID); err != nil {
  1634  			return nil, hlc.Timestamp{}, err
  1635  		}
  1636  		table, expiration, err = m.Acquire(ctx, timestamp, tableID)
  1637  		if err != nil {
  1638  			return nil, hlc.Timestamp{}, err
  1639  		}
  1640  		if !NameMatchesTable(&table.TableDescriptor, dbID, schemaID, tableName) {
  1641  			// If the name we had doesn't match the newest descriptor in the DB, then
  1642  			// we're trying to use an old name.
  1643  			if err := m.Release(table); err != nil {
  1644  				log.Warningf(ctx, "error releasing lease: %s", err)
  1645  			}
  1646  			return nil, hlc.Timestamp{}, sqlbase.ErrDescriptorNotFound
  1647  		}
  1648  	}
  1649  	return table, expiration, nil
  1650  }
  1651  
  1652  // resolveName resolves a table name to a descriptor ID at a particular
  1653  // timestamp by looking in the database. If the mapping is not found,
  1654  // sqlbase.ErrDescriptorNotFound is returned.
  1655  func (m *Manager) resolveName(
  1656  	ctx context.Context,
  1657  	timestamp hlc.Timestamp,
  1658  	dbID sqlbase.ID,
  1659  	schemaID sqlbase.ID,
  1660  	tableName string,
  1661  ) (sqlbase.ID, error) {
  1662  	id := sqlbase.InvalidID
  1663  	if err := m.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1664  		// Run the name lookup as high-priority, thereby pushing any intents out of
  1665  		// its way. We don't want schema changes to prevent name resolution/lease
  1666  		// acquisitions; we'd rather force them to refresh. Also this prevents
  1667  		// deadlocks in cases where the name resolution is triggered by the
  1668  		// transaction doing the schema change itself.
  1669  		if err := txn.SetUserPriority(roachpb.MaxUserPriority); err != nil {
  1670  			return err
  1671  		}
  1672  		txn.SetFixedTimestamp(ctx, timestamp)
  1673  		var found bool
  1674  		var err error
  1675  		found, id, err = sqlbase.LookupObjectID(ctx, txn, m.codec, dbID, schemaID, tableName)
  1676  		if err != nil {
  1677  			return err
  1678  		}
  1679  		if !found {
  1680  			return nil
  1681  		}
  1682  		return nil
  1683  	}); err != nil {
  1684  		return id, err
  1685  	}
  1686  	if id == sqlbase.InvalidID {
  1687  		return id, sqlbase.ErrDescriptorNotFound
  1688  	}
  1689  	return id, nil
  1690  }
  1691  
  1692  // Acquire acquires a read lease for the specified table ID valid for
  1693  // the timestamp. It returns the table descriptor and a expiration time.
  1694  // A transaction using this descriptor must ensure that its
  1695  // commit-timestamp < expiration-time. Care must be taken to not modify
  1696  // the returned descriptor.
  1697  //
  1698  // Known limitation: Acquire() can return an error after the table with
  1699  // the tableID has been dropped. This is true even when using a timestamp
  1700  // less than the timestamp of the DROP command. This is because Acquire
  1701  // can only return an older version of a descriptor if the latest version
  1702  // can be leased; as it stands a dropped table cannot be leased.
  1703  func (m *Manager) Acquire(
  1704  	ctx context.Context, timestamp hlc.Timestamp, tableID sqlbase.ID,
  1705  ) (*sqlbase.ImmutableTableDescriptor, hlc.Timestamp, error) {
  1706  	for {
  1707  		t := m.findTableState(tableID, true /*create*/)
  1708  		table, latest, err := t.findForTimestamp(ctx, timestamp)
  1709  		if err == nil {
  1710  			// If the latest lease is nearly expired, ensure a renewal is queued.
  1711  			if latest {
  1712  				durationUntilExpiry := time.Duration(table.expiration.WallTime - timestamp.WallTime)
  1713  				if durationUntilExpiry < m.Storage.leaseRenewalTimeout {
  1714  					if err := t.maybeQueueLeaseRenewal(ctx, m, tableID, table.Name); err != nil {
  1715  						return nil, hlc.Timestamp{}, err
  1716  					}
  1717  				}
  1718  			}
  1719  			return &table.ImmutableTableDescriptor, table.expiration, nil
  1720  		}
  1721  		switch {
  1722  		case errors.Is(err, errRenewLease):
  1723  			if err := func() error {
  1724  				t.markAcquisitionStart(ctx)
  1725  				defer t.markAcquisitionDone(ctx)
  1726  				// Renew lease and retry. This will block until the lease is acquired.
  1727  				_, errLease := acquireNodeLease(ctx, m, tableID)
  1728  				return errLease
  1729  			}(); err != nil {
  1730  				return nil, hlc.Timestamp{}, err
  1731  			}
  1732  
  1733  			if m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent != nil {
  1734  				m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent(AcquireBlock)
  1735  			}
  1736  
  1737  		case errors.Is(err, errReadOlderTableVersion):
  1738  			// Read old table versions from the store. This can block while reading
  1739  			// old table versions from the store.
  1740  			versions, errRead := m.readOlderVersionForTimestamp(ctx, tableID, timestamp)
  1741  			if errRead != nil {
  1742  				return nil, hlc.Timestamp{}, errRead
  1743  			}
  1744  			m.insertTableVersions(tableID, versions)
  1745  
  1746  		default:
  1747  			return nil, hlc.Timestamp{}, err
  1748  		}
  1749  	}
  1750  }
  1751  
  1752  // Release releases a previously acquired table.
  1753  func (m *Manager) Release(desc *sqlbase.ImmutableTableDescriptor) error {
  1754  	t := m.findTableState(desc.ID, false /* create */)
  1755  	if t == nil {
  1756  		return errors.Errorf("table %d not found", desc.ID)
  1757  	}
  1758  	// TODO(pmattis): Can/should we delete from Manager.tables if the
  1759  	// tableState becomes empty?
  1760  	// TODO(andrei): I think we never delete from Manager.tables... which
  1761  	// could be bad if a lot of tables keep being created. I looked into cleaning
  1762  	// up a bit, but it seems tricky to do with the current locking which is split
  1763  	// between Manager and tableState.
  1764  	l, err := t.release(desc, m.removeOnceDereferenced())
  1765  	if err != nil {
  1766  		return err
  1767  	}
  1768  	if l != nil {
  1769  		releaseLease(l, m)
  1770  	}
  1771  	return nil
  1772  }
  1773  
  1774  // removeOnceDereferenced returns true if the Manager thinks
  1775  // a tableVersionState can be removed after its refcount goes to 0.
  1776  func (m *Manager) removeOnceDereferenced() bool {
  1777  	return m.Storage.testingKnobs.RemoveOnceDereferenced ||
  1778  		// Release from the store if the Manager is draining.
  1779  		m.isDraining()
  1780  }
  1781  
  1782  func (m *Manager) isDraining() bool {
  1783  	return m.draining.Load().(bool)
  1784  }
  1785  
  1786  // SetDraining (when called with 'true') removes all inactive leases. Any leases
  1787  // that are active will be removed once the lease's reference count drops to 0.
  1788  //
  1789  // The reporter callback, if non-nil, is called on a best effort basis
  1790  // to report work that needed to be done and which may or may not have
  1791  // been done by the time this call returns. See the explanation in
  1792  // pkg/server/drain.go for details.
  1793  func (m *Manager) SetDraining(drain bool, reporter func(int, string)) {
  1794  	m.draining.Store(drain)
  1795  	if !drain {
  1796  		return
  1797  	}
  1798  
  1799  	m.mu.Lock()
  1800  	defer m.mu.Unlock()
  1801  	for _, t := range m.mu.tables {
  1802  		t.mu.Lock()
  1803  		leases := t.removeInactiveVersions()
  1804  		t.mu.Unlock()
  1805  		for _, l := range leases {
  1806  			releaseLease(l, m)
  1807  		}
  1808  		if reporter != nil {
  1809  			// Report progress through the Drain RPC.
  1810  			reporter(len(leases), "table leases")
  1811  		}
  1812  	}
  1813  }
  1814  
  1815  // If create is set, cache and stopper need to be set as well.
  1816  func (m *Manager) findTableState(tableID sqlbase.ID, create bool) *tableState {
  1817  	m.mu.Lock()
  1818  	defer m.mu.Unlock()
  1819  	t := m.mu.tables[tableID]
  1820  	if t == nil && create {
  1821  		t = &tableState{id: tableID, stopper: m.stopper}
  1822  		m.mu.tables[tableID] = t
  1823  	}
  1824  	return t
  1825  }
  1826  
  1827  // RefreshLeases starts a goroutine that refreshes the lease manager
  1828  // leases for tables received in the latest system configuration via gossip or
  1829  // rangefeeds. This function must be passed a non-nil gossip if
  1830  // VersionRangefeedLeases is not active.
  1831  func (m *Manager) RefreshLeases(
  1832  	ctx context.Context, s *stop.Stopper, db *kv.DB, g gossip.DeprecatedGossip,
  1833  ) {
  1834  	s.RunWorker(ctx, func(ctx context.Context) {
  1835  		m.refreshLeases(ctx, g, db, s)
  1836  	})
  1837  }
  1838  
  1839  func (m *Manager) refreshLeases(
  1840  	ctx context.Context, g gossip.DeprecatedGossip, db *kv.DB, s *stop.Stopper,
  1841  ) {
  1842  	tableUpdateCh := make(chan *sqlbase.TableDescriptor)
  1843  	m.watchForUpdates(ctx, s, db, g, tableUpdateCh)
  1844  	s.RunWorker(ctx, func(ctx context.Context) {
  1845  		for {
  1846  			select {
  1847  			case table := <-tableUpdateCh:
  1848  				// NB: We allow nil tables to be sent to synchronize the updating of
  1849  				// tables.
  1850  				if table == nil {
  1851  					continue
  1852  				}
  1853  
  1854  				if evFunc := m.testingKnobs.TestingTableUpdateEvent; evFunc != nil {
  1855  					if err := evFunc(table); err != nil {
  1856  						log.Infof(ctx, "skipping table update of %v due to knob: %v",
  1857  							table, err)
  1858  					}
  1859  				}
  1860  
  1861  				// Try to refresh the table lease to one >= this version.
  1862  				log.VEventf(ctx, 2, "purging old version of table %d@%d (offline %v)",
  1863  					table.ID, table.Version, table.GoingOffline())
  1864  				if err := purgeOldVersions(
  1865  					ctx, db, table.ID, table.GoingOffline(), table.Version, m); err != nil {
  1866  					log.Warningf(ctx, "error purging leases for table %d(%s): %s",
  1867  						table.ID, table.Name, err)
  1868  				}
  1869  
  1870  				if evFunc := m.testingKnobs.TestingTableRefreshedEvent; evFunc != nil {
  1871  					evFunc(table)
  1872  				}
  1873  
  1874  			case <-s.ShouldQuiesce():
  1875  				return
  1876  			}
  1877  		}
  1878  	})
  1879  }
  1880  
  1881  // watchForUpdates will watch either gossip or rangefeeds for updates. If the
  1882  // version does not currently support rangefeeds, gossip will be used until
  1883  // rangefeeds are supported, at which time, the system will shut down the
  1884  // gossip listener and start using rangefeeds.
  1885  func (m *Manager) watchForUpdates(
  1886  	ctx context.Context,
  1887  	s *stop.Stopper,
  1888  	db *kv.DB,
  1889  	g gossip.DeprecatedGossip,
  1890  	tableUpdateCh chan *sqlbase.TableDescriptor,
  1891  ) {
  1892  	useRangefeeds := m.testingKnobs.AlwaysUseRangefeeds ||
  1893  		m.settings.Version.IsActive(ctx, clusterversion.VersionRangefeedLeases)
  1894  	if useRangefeeds {
  1895  		m.watchForRangefeedUpdates(ctx, s, db, tableUpdateCh)
  1896  		return
  1897  	}
  1898  	gossipCtx, cancelWatchingGossip := context.WithCancel(ctx)
  1899  	m.watchForGossipUpdates(gossipCtx, s, g, tableUpdateCh)
  1900  	canUseRangefeedsCh := m.waitForRangefeedsToBeUsable(ctx, s)
  1901  	if err := s.RunAsyncTask(ctx, "wait for upgrade", func(ctx context.Context) {
  1902  		select {
  1903  		case <-s.ShouldQuiesce():
  1904  			return
  1905  		case <-canUseRangefeedsCh:
  1906  			// Note: It's okay that the cancelation of gossip watching is
  1907  			// asynchronous. At worst we'd get duplicate updates or stale updates.
  1908  			// Both of those are handled.
  1909  			cancelWatchingGossip()
  1910  			// Note: It's safe to start watching for rangefeeds now. We know that all
  1911  			// nodes support rangefeeds in the system config span. Even though there
  1912  			// may not have been logical ops for all operations in the log, the
  1913  			// catch-up scan should take us up to the present.
  1914  			//
  1915  			// When the rangefeed starts up we'll pass it an initial timestamp which
  1916  			// is no newer than all updates to the system config span we've already
  1917  			// seen (see setResolvedTimestamp and its callers). The rangefeed API
  1918  			// ensures that we will see all updates from on or before that timestamp
  1919  			// at least once.
  1920  			m.watchForRangefeedUpdates(ctx, s, db, tableUpdateCh)
  1921  		}
  1922  	}); err != nil {
  1923  		// Note: this can only happen if the stopper has been stopped.
  1924  		return
  1925  	}
  1926  }
  1927  
  1928  func (m *Manager) watchForGossipUpdates(
  1929  	ctx context.Context,
  1930  	s *stop.Stopper,
  1931  	g gossip.DeprecatedGossip,
  1932  	tableUpdateCh chan<- *sqlbase.TableDescriptor,
  1933  ) {
  1934  	if _, err := g.OptionalErr(47150); err != nil {
  1935  		log.Fatalf(ctx, "required gossip until %v is active: %v", clusterversion.VersionRangefeedLeases, err)
  1936  	}
  1937  
  1938  	s.RunWorker(ctx, func(ctx context.Context) {
  1939  		descKeyPrefix := m.codec.TablePrefix(uint32(sqlbase.DescriptorTable.ID))
  1940  		// TODO(ajwerner): Add a mechanism to unregister this channel upon return.
  1941  		gossipUpdateC := g.DeprecatedRegisterSystemConfigChannel(47150)
  1942  		filter := gossip.MakeSystemConfigDeltaFilter(descKeyPrefix)
  1943  
  1944  		ctx, cancel := s.WithCancelOnQuiesce(ctx)
  1945  		defer cancel()
  1946  		for {
  1947  			select {
  1948  			case <-gossipUpdateC:
  1949  				m.handleUpdatedSystemCfg(ctx, g, &filter, tableUpdateCh)
  1950  			case <-s.ShouldQuiesce():
  1951  				return
  1952  			}
  1953  		}
  1954  	})
  1955  }
  1956  
// watchForRangefeedUpdates sets up rangefeed-based delivery of table
// descriptor updates. It starts two pieces of background work on the
// stopper:
//
//   - an async task that runs a rangefeed over the descriptor table's key
//     span, starting from the Manager's resolved timestamp, and restarts it
//     whenever it returns while ctx is still live;
//   - a worker that consumes the rangefeed events: checkpoints advance the
//     resolved timestamp, errors are logged, and values are decoded,
//     validated and forwarded on tableUpdateCh.
//
// If the async task cannot be started, the function returns without
// starting the consuming worker.
func (m *Manager) watchForRangefeedUpdates(
	ctx context.Context, s *stop.Stopper, db *kv.DB, tableUpdateCh chan<- *sqlbase.TableDescriptor,
) {
	if log.V(1) {
		log.Infof(ctx, "using rangefeeds for lease manager updates")
	}
	// Unwrap the DB's non-transactional sender down to the DistSender. These
	// type assertions are unchecked and panic if the sender chain has a
	// different shape.
	distSender := db.NonTransactionalSender().(*kv.CrossRangeTxnWrapperSender).Wrapped().(*kvcoord.DistSender)
	eventCh := make(chan *roachpb.RangeFeedEvent)
	// The returned cancel function is discarded.
	// NOTE(review): presumably ctx is canceled when the stopper quiesces (per
	// the helper's name) — confirm; go vet's lostcancel check may flag this.
	ctx, _ = s.WithCancelOnQuiesce(ctx)
	if err := s.RunAsyncTask(ctx, "lease rangefeed", func(ctx context.Context) {
		for {
			// Re-read the resolved timestamp on every (re)start so that a
			// restarted rangefeed begins from the latest resolved point.
			ts := m.getResolvedTimestamp()
			descKeyPrefix := m.codec.TablePrefix(uint32(sqlbase.DescriptorTable.ID))
			span := roachpb.Span{
				Key:    descKeyPrefix,
				EndKey: descKeyPrefix.PrefixEnd(),
			}
			// Note: We don't need to use withDiff to detect version changes because
			// the Manager already stores the relevant version information.
			const withDiff = false
			log.VEventf(ctx, 1, "starting rangefeed from %v on %v", ts, span)
			err := distSender.RangeFeed(ctx, span, ts, withDiff, eventCh)
			// Only warn about failures that are not explained by cancellation.
			if err != nil && ctx.Err() == nil {
				log.Warningf(ctx, "lease rangefeed failed, restarting: %v", err)
			}
			if ctx.Err() != nil {
				log.VEventf(ctx, 1, "exiting rangefeed")
				return
			}
		}
	}); err != nil {
		// This will only fail if the stopper has been stopped.
		return
	}
	// handleEvent decodes a rangefeed value into a table descriptor and, if
	// it is a valid table, sends it on tableUpdateCh (or gives up once ctx is
	// done). Values that are empty, fail to decode, are not tables, or fail
	// validation are dropped.
	handleEvent := func(ev *roachpb.RangeFeedValue) {
		// Nothing to decode for an empty value.
		if len(ev.Value.RawBytes) == 0 {
			return
		}
		var descriptor sqlbase.Descriptor
		if err := ev.Value.GetProto(&descriptor); err != nil {
			log.ReportOrPanic(ctx, &m.settings.SV,
				"%s: unable to unmarshal descriptor %v", ev.Key, ev.Value)
			return
		}
		// A nil result means the descriptor is not a table; ignore it.
		table := descriptor.Table(ev.Value.Timestamp)
		if table == nil {
			return
		}

		// Note that we don't need to "fill in" the descriptor here. Nobody
		// actually reads the table, but it's necessary for the call to
		// ValidateTable().
		if err := table.MaybeFillInDescriptor(ctx, nil, m.codec); err != nil {
			log.ReportOrPanic(ctx, &m.settings.SV,
				"%s: unable to fill in table descriptor %v", ev.Key, table)
			return
		}
		if err := table.ValidateTable(); err != nil {
			// Note: we don't ReportOrPanic here because invalid descriptors are
			// sometimes created during testing.
			log.Errorf(ctx, "%s: received invalid table descriptor: %s. Desc: %v", ev.Key, err, table)
			return
		}
		if log.V(2) {
			log.Infof(ctx, "%s: refreshing lease table: %d (%s), version: %d, dropped: %t",
				ev.Key, table.ID, table.Name, table.Version, table.Dropped())
		}
		// Block until the update is consumed or ctx is done.
		select {
		case <-ctx.Done():
		case tableUpdateCh <- table:
		}
	}
	// Consume rangefeed events until ctx is done.
	s.RunWorker(ctx, func(ctx context.Context) {
		for {
			select {
			case <-ctx.Done():
				return
			case e := <-eventCh:
				// Checkpoints only advance the resolved timestamp.
				if e.Checkpoint != nil {
					log.VEventf(ctx, 2, "got rangefeed checkpoint %v", e.Checkpoint)
					m.setResolvedTimestamp(e.Checkpoint.ResolvedTS)
					continue
				}
				// Errors delivered as events are logged and otherwise ignored.
				if e.Error != nil {
					log.Warningf(ctx, "got an error from a rangefeed: %v", e.Error.Error)
					continue
				}
				if e.Val != nil {
					handleEvent(e.Val)
				}
			}
		}
	})
}
  2051  
  2052  func (m *Manager) handleUpdatedSystemCfg(
  2053  	ctx context.Context,
  2054  	g gossip.DeprecatedGossip,
  2055  	cfgFilter *gossip.SystemConfigDeltaFilter,
  2056  	tableUpdateChan chan<- *sqlbase.TableDescriptor,
  2057  ) {
  2058  	cfg := g.DeprecatedSystemConfig(47150)
  2059  	// Read all tables and their versions
  2060  	if log.V(2) {
  2061  		log.Info(ctx, "received a new config; will refresh leases")
  2062  	}
  2063  	var latestTimestamp hlc.Timestamp
  2064  	cfgFilter.ForModified(cfg, func(kv roachpb.KeyValue) {
  2065  		// Attempt to unmarshal config into a table/database descriptor.
  2066  		var descriptor sqlbase.Descriptor
  2067  		if latestTimestamp.Less(kv.Value.Timestamp) {
  2068  			latestTimestamp = kv.Value.Timestamp
  2069  		}
  2070  		if err := kv.Value.GetProto(&descriptor); err != nil {
  2071  			log.Warningf(ctx, "%s: unable to unmarshal descriptor %v", kv.Key, kv.Value)
  2072  			return
  2073  		}
  2074  		switch union := descriptor.Union.(type) {
  2075  		case *sqlbase.Descriptor_Table:
  2076  			table := union.Table
  2077  			// Note that we don't need to "fill in" the descriptor here. Nobody
  2078  			// actually reads the table, but it's necessary for the call to
  2079  			// ValidateTable().
  2080  			if err := table.MaybeFillInDescriptor(ctx, nil, m.codec); err != nil {
  2081  				log.Warningf(ctx, "%s: unable to fill in table descriptor %v", kv.Key, table)
  2082  				return
  2083  			}
  2084  			if err := table.ValidateTable(); err != nil {
  2085  				log.Errorf(ctx, "%s: received invalid table descriptor: %s. Desc: %v",
  2086  					kv.Key, err, table,
  2087  				)
  2088  				return
  2089  			}
  2090  			if log.V(2) {
  2091  				log.Infof(ctx, "%s: refreshing lease table: %d (%s), version: %d, dropped: %t",
  2092  					kv.Key, table.ID, table.Name, table.Version, table.Dropped())
  2093  			}
  2094  			select {
  2095  			case <-ctx.Done():
  2096  			case tableUpdateChan <- table:
  2097  			}
  2098  
  2099  		case *sqlbase.Descriptor_Database:
  2100  			// Ignore.
  2101  		}
  2102  	})
  2103  	if !latestTimestamp.IsEmpty() {
  2104  		m.setResolvedTimestamp(latestTimestamp)
  2105  	}
  2106  	// Attempt to shove a nil table descriptor into the channel to ensure that
  2107  	// we've processed all of the events previously sent.
  2108  	select {
  2109  	case <-ctx.Done():
  2110  		// If we've been canceled, the other size of the channel will also have
  2111  		// been canceled.
  2112  	case tableUpdateChan <- nil:
  2113  	}
  2114  }
  2115  
  2116  // waitForRangefeedsToBeUsable returns a channel which is closed when rangefeeds
  2117  // are usable according to the cluster version.
  2118  func (m *Manager) waitForRangefeedsToBeUsable(ctx context.Context, s *stop.Stopper) chan struct{} {
  2119  	// TODO(ajwerner): Add a callback to notify about version changes.
  2120  	// Checking is pretty cheap but really this should be a callback.
  2121  	const defaultCheckInterval = 10 * time.Second
  2122  	checkInterval := defaultCheckInterval
  2123  	if m.testingKnobs.VersionPollIntervalForRangefeeds != 0 {
  2124  		checkInterval = m.testingKnobs.VersionPollIntervalForRangefeeds
  2125  	}
  2126  	upgradeChan := make(chan struct{})
  2127  	timer := timeutil.NewTimer()
  2128  	timer.Reset(0)
  2129  	s.RunWorker(ctx, func(ctx context.Context) {
  2130  		for {
  2131  			select {
  2132  			case <-timer.C:
  2133  				timer.Read = true
  2134  				if m.settings.Version.IsActive(ctx, clusterversion.VersionRangefeedLeases) {
  2135  					close(upgradeChan)
  2136  					return
  2137  				}
  2138  				timer.Reset(checkInterval)
  2139  			case <-ctx.Done():
  2140  				return
  2141  			case <-s.ShouldQuiesce():
  2142  				return
  2143  			}
  2144  		}
  2145  	})
  2146  	return upgradeChan
  2147  }
  2148  
  2149  // setResolvedTimestamp marks the Manager as having processed all updates
  2150  // up to this timestamp. It is set under the gossip path based on the highest
  2151  // timestamp seen in a system config and under the rangefeed path when a
  2152  // resolved timestamp is received.
  2153  func (m *Manager) setResolvedTimestamp(ts hlc.Timestamp) {
  2154  	m.mu.Lock()
  2155  	defer m.mu.Unlock()
  2156  	if m.mu.updatesResolvedTimestamp.Less(ts) {
  2157  		m.mu.updatesResolvedTimestamp = ts
  2158  	}
  2159  }
  2160  
  2161  func (m *Manager) getResolvedTimestamp() hlc.Timestamp {
  2162  	m.mu.Lock()
  2163  	defer m.mu.Unlock()
  2164  	return m.mu.updatesResolvedTimestamp
  2165  }
  2166  
// tableLeaseRefreshLimit is the upper-limit on the number of table leases
// that will continuously have their lease refreshed. It is registered as the
// integer cluster setting "sql.tablecache.lease.refresh_limit" with a
// default of 50.
var tableLeaseRefreshLimit = settings.RegisterIntSetting(
	"sql.tablecache.lease.refresh_limit",
	"maximum number of tables to periodically refresh leases for",
	50,
)
  2174  
  2175  // PeriodicallyRefreshSomeLeases so that leases are fresh and can serve
  2176  // traffic immediately.
  2177  // TODO(vivek): Remove once epoch based table leases are implemented.
  2178  func (m *Manager) PeriodicallyRefreshSomeLeases(ctx context.Context) {
  2179  	m.stopper.RunWorker(ctx, func(ctx context.Context) {
  2180  		if m.leaseDuration <= 0 {
  2181  			return
  2182  		}
  2183  		refreshTimer := timeutil.NewTimer()
  2184  		defer refreshTimer.Stop()
  2185  		refreshTimer.Reset(m.Storage.jitteredLeaseDuration() / 2)
  2186  		for {
  2187  			select {
  2188  			case <-m.stopper.ShouldQuiesce():
  2189  				return
  2190  
  2191  			case <-refreshTimer.C:
  2192  				refreshTimer.Read = true
  2193  				refreshTimer.Reset(m.Storage.jitteredLeaseDuration() / 2)
  2194  
  2195  				m.refreshSomeLeases(ctx)
  2196  			}
  2197  		}
  2198  	})
  2199  }
  2200  
  2201  // Refresh some of the current leases.
  2202  func (m *Manager) refreshSomeLeases(ctx context.Context) {
  2203  	limit := tableLeaseRefreshLimit.Get(&m.settings.SV)
  2204  	if limit <= 0 {
  2205  		return
  2206  	}
  2207  	// Construct a list of tables needing their leases to be reacquired.
  2208  	m.mu.Lock()
  2209  	ids := make([]sqlbase.ID, 0, len(m.mu.tables))
  2210  	var i int64
  2211  	for k, table := range m.mu.tables {
  2212  		if i++; i > limit {
  2213  			break
  2214  		}
  2215  		table.mu.Lock()
  2216  		dropped := table.mu.dropped
  2217  		table.mu.Unlock()
  2218  		if !dropped {
  2219  			ids = append(ids, k)
  2220  		}
  2221  	}
  2222  	m.mu.Unlock()
  2223  	// Limit the number of concurrent lease refreshes.
  2224  	var wg sync.WaitGroup
  2225  	for i := range ids {
  2226  		id := ids[i]
  2227  		wg.Add(1)
  2228  		if err := m.stopper.RunLimitedAsyncTask(
  2229  			ctx, fmt.Sprintf("refresh table:%d lease", id), m.sem, true /*wait*/, func(ctx context.Context) {
  2230  				defer wg.Done()
  2231  				if _, err := acquireNodeLease(ctx, m, id); err != nil {
  2232  					log.Infof(ctx, "refreshing table: %d lease failed: %s", id, err)
  2233  				}
  2234  			}); err != nil {
  2235  			log.Infof(ctx, "didnt refresh table: %d lease: %s", id, err)
  2236  			wg.Done()
  2237  		}
  2238  	}
  2239  	wg.Wait()
  2240  }
  2241  
  2242  // DeleteOrphanedLeases releases all orphaned leases created by a prior
  2243  // instance of this node. timeThreshold is a walltime lower than the
  2244  // lowest hlc timestamp that the current instance of the node can use.
  2245  func (m *Manager) DeleteOrphanedLeases(timeThreshold int64) {
  2246  	if m.testingKnobs.DisableDeleteOrphanedLeases {
  2247  		return
  2248  	}
  2249  	// TODO(asubiotto): clear up the nodeID naming here and in the table below,
  2250  	// tracked as https://github.com/cockroachdb/cockroach/issues/48271.
  2251  	nodeID := m.Storage.nodeIDContainer.SQLInstanceID()
  2252  	if nodeID == 0 {
  2253  		panic("zero nodeID")
  2254  	}
  2255  
  2256  	// Run as async worker to prevent blocking the main server Start method.
  2257  	// Exit after releasing all the orphaned leases.
  2258  	m.stopper.RunWorker(context.Background(), func(ctx context.Context) {
  2259  		// This could have been implemented using DELETE WHERE, but DELETE WHERE
  2260  		// doesn't implement AS OF SYSTEM TIME.
  2261  
  2262  		// Read orphaned leases.
  2263  		sqlQuery := fmt.Sprintf(`
  2264  SELECT "descID", version, expiration FROM system.public.lease AS OF SYSTEM TIME %d WHERE "nodeID" = %d
  2265  `, timeThreshold, nodeID)
  2266  		var rows []tree.Datums
  2267  		retryOptions := base.DefaultRetryOptions()
  2268  		retryOptions.Closer = m.stopper.ShouldQuiesce()
  2269  		// The retry is required because of errors caused by node restarts. Retry 30 times.
  2270  		if err := retry.WithMaxAttempts(ctx, retryOptions, 30, func() error {
  2271  			var err error
  2272  			rows, err = m.Storage.internalExecutor.Query(
  2273  				ctx, "read orphaned table leases", nil /*txn*/, sqlQuery)
  2274  			return err
  2275  		}); err != nil {
  2276  			log.Warningf(ctx, "unable to read orphaned leases: %+v", err)
  2277  			return
  2278  		}
  2279  
  2280  		var wg sync.WaitGroup
  2281  		defer wg.Wait()
  2282  		for i := range rows {
  2283  			// Early exit?
  2284  			row := rows[i]
  2285  			wg.Add(1)
  2286  			lease := storedTableLease{
  2287  				id:         sqlbase.ID(tree.MustBeDInt(row[0])),
  2288  				version:    int(tree.MustBeDInt(row[1])),
  2289  				expiration: tree.MustBeDTimestamp(row[2]),
  2290  			}
  2291  			if err := m.stopper.RunLimitedAsyncTask(
  2292  				ctx, fmt.Sprintf("release table lease %+v", lease), m.sem, true /*wait*/, func(ctx context.Context) {
  2293  					m.Storage.release(ctx, m.stopper, &lease)
  2294  					log.Infof(ctx, "released orphaned table lease: %+v", lease)
  2295  					wg.Done()
  2296  				}); err != nil {
  2297  				log.Warningf(ctx, "did not release orphaned table lease: %+v, err = %s", lease, err)
  2298  				wg.Done()
  2299  			}
  2300  		}
  2301  	})
  2302  }
  2303  
// DB returns the Manager's handle to a kv.DB, i.e. the value stored in its
// db field.
func (m *Manager) DB() *kv.DB {
	return m.db
}
  2308  
// Codec returns the Manager's SQLCodec, i.e. the value stored in its codec
// field.
func (m *Manager) Codec() keys.SQLCodec {
	return m.codec
}
  2313  
  2314  // VisitLeases introspects the state of leases managed by the Manager.
  2315  //
  2316  // TODO(ajwerner): consider refactoring the function to take a struct, maybe
  2317  // called LeaseInfo.
  2318  func (m *Manager) VisitLeases(
  2319  	f func(desc sqlbase.TableDescriptor, dropped bool, refCount int, expiration tree.DTimestamp) (wantMore bool),
  2320  ) {
  2321  	m.mu.Lock()
  2322  	defer m.mu.Unlock()
  2323  	for _, ts := range m.mu.tables {
  2324  		tableVisitor := func() (wantMore bool) {
  2325  			ts.mu.Lock()
  2326  			defer ts.mu.Unlock()
  2327  
  2328  			dropped := ts.mu.dropped
  2329  
  2330  			for _, state := range ts.mu.active.data {
  2331  				state.mu.Lock()
  2332  				lease := state.mu.lease
  2333  				refCount := state.mu.refcount
  2334  				state.mu.Unlock()
  2335  
  2336  				if lease == nil {
  2337  					continue
  2338  				}
  2339  
  2340  				if !f(state.TableDescriptor, dropped, refCount, lease.expiration) {
  2341  					return false
  2342  				}
  2343  			}
  2344  			return true
  2345  		}
  2346  		if !tableVisitor() {
  2347  			return
  2348  		}
  2349  	}
  2350  }
  2351  
  2352  // TestingAcquireAndAssertMinVersion acquires a read lease for the specified
  2353  // table ID. The lease is grabbed on the latest version if >= specified version.
  2354  // It returns a table descriptor and an expiration time valid for the timestamp.
  2355  // This method is useful for testing and is only intended to be used in that
  2356  // context.
  2357  func (m *Manager) TestingAcquireAndAssertMinVersion(
  2358  	ctx context.Context,
  2359  	timestamp hlc.Timestamp,
  2360  	tableID sqlbase.ID,
  2361  	minVersion sqlbase.DescriptorVersion,
  2362  ) (*sqlbase.ImmutableTableDescriptor, hlc.Timestamp, error) {
  2363  	t := m.findTableState(tableID, true)
  2364  	if err := ensureVersion(ctx, tableID, minVersion, m); err != nil {
  2365  		return nil, hlc.Timestamp{}, err
  2366  	}
  2367  	table, _, err := t.findForTimestamp(ctx, timestamp)
  2368  	if err != nil {
  2369  		return nil, hlc.Timestamp{}, err
  2370  	}
  2371  	return &table.ImmutableTableDescriptor, table.expiration, nil
  2372  }