github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/catalog/lease/lease.go (about) 1 // Copyright 2015 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 // Package lease provides functionality to create and manage sql schema leases. 12 package lease 13 14 import ( 15 "bytes" 16 "context" 17 "fmt" 18 "math/rand" 19 "sort" 20 "strings" 21 "sync" 22 "sync/atomic" 23 "time" 24 25 "github.com/cockroachdb/cockroach/pkg/base" 26 "github.com/cockroachdb/cockroach/pkg/clusterversion" 27 "github.com/cockroachdb/cockroach/pkg/gossip" 28 "github.com/cockroachdb/cockroach/pkg/keys" 29 "github.com/cockroachdb/cockroach/pkg/kv" 30 "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" 31 "github.com/cockroachdb/cockroach/pkg/roachpb" 32 "github.com/cockroachdb/cockroach/pkg/security" 33 "github.com/cockroachdb/cockroach/pkg/settings" 34 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 35 "github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkv" 36 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 37 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 38 "github.com/cockroachdb/cockroach/pkg/sql/sqlutil" 39 "github.com/cockroachdb/cockroach/pkg/util/hlc" 40 "github.com/cockroachdb/cockroach/pkg/util/log" 41 "github.com/cockroachdb/cockroach/pkg/util/quotapool" 42 "github.com/cockroachdb/cockroach/pkg/util/retry" 43 "github.com/cockroachdb/cockroach/pkg/util/stop" 44 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 45 "github.com/cockroachdb/cockroach/pkg/util/syncutil/singleflight" 46 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 47 "github.com/cockroachdb/cockroach/pkg/util/tracing" 48 
"github.com/cockroachdb/errors" 49 "github.com/cockroachdb/logtags" 50 ) 51 52 var errRenewLease = errors.New("renew lease on id") 53 var errReadOlderTableVersion = errors.New("read older table version from store") 54 55 // A lease stored in system.lease. 56 type storedTableLease struct { 57 id sqlbase.ID 58 version int 59 expiration tree.DTimestamp 60 } 61 62 // tableVersionState holds the state for a table version. This includes 63 // the lease information for a table version. 64 // TODO(vivek): A node only needs to manage lease information on what it 65 // thinks is the latest version for a table descriptor. 66 type tableVersionState struct { 67 // This descriptor is immutable and can be shared by many goroutines. 68 // Care must be taken to not modify it. 69 sqlbase.ImmutableTableDescriptor 70 71 // The expiration time for the table version. A transaction with 72 // timestamp T can use this table descriptor version iff 73 // TableDescriptor.ModificationTime <= T < expiration 74 // 75 // The expiration time is either the expiration time of the lease 76 // when a lease is associated with the table version, or the 77 // ModificationTime of the next version when the table version 78 // isn't associated with a lease. 79 expiration hlc.Timestamp 80 81 mu struct { 82 syncutil.Mutex 83 84 refcount int 85 // Set if the node has a lease on this descriptor version. 86 // Leases can only be held for the two latest versions of 87 // a table descriptor. The latest version known to a node 88 // (can be different than the current latest version in the store) 89 // is always associated with a lease. The previous version known to 90 // a node might not necessarily be associated with a lease. 91 lease *storedTableLease 92 } 93 } 94 95 func (s *tableVersionState) String() string { 96 s.mu.Lock() 97 defer s.mu.Unlock() 98 return s.stringLocked() 99 } 100 101 // stringLocked reads mu.refcount and thus needs to have mu held. 
102 func (s *tableVersionState) stringLocked() string { 103 return fmt.Sprintf("%d(%q) ver=%d:%s, refcount=%d", s.ID, s.Name, s.Version, s.expiration, s.mu.refcount) 104 } 105 106 // hasExpired checks if the table is too old to be used (by a txn operating) 107 // at the given timestamp 108 func (s *tableVersionState) hasExpired(timestamp hlc.Timestamp) bool { 109 return s.expiration.LessEq(timestamp) 110 } 111 112 // hasValidExpiration checks that this table have a larger expiration than 113 // the existing one it is replacing. This can be used to check the 114 // monotonicity of the expiration times on a table at a particular version. 115 // The version is not explicitly checked here. 116 func (s *tableVersionState) hasValidExpiration(existing *tableVersionState) bool { 117 return existing.expiration.Less(s.expiration) 118 } 119 120 func (s *tableVersionState) incRefcount() { 121 s.mu.Lock() 122 s.incRefcountLocked() 123 s.mu.Unlock() 124 } 125 126 func (s *tableVersionState) incRefcountLocked() { 127 s.mu.refcount++ 128 if log.V(2) { 129 log.VEventf(context.TODO(), 2, "tableVersionState.incRef: %s", s.stringLocked()) 130 } 131 } 132 133 // The lease expiration stored in the database is of a different type. 134 // We've decided that it's too much work to change the type to 135 // hlc.Timestamp, so we're using this method to give us the stored 136 // type: tree.DTimestamp. 137 func storedLeaseExpiration(expiration hlc.Timestamp) tree.DTimestamp { 138 return tree.DTimestamp{Time: timeutil.Unix(0, expiration.WallTime).Round(time.Microsecond)} 139 } 140 141 // Storage implements the operations for acquiring and releasing leases and 142 // publishing a new version of a descriptor. Exported only for testing. 
143 type Storage struct { 144 nodeIDContainer *base.SQLIDContainer 145 db *kv.DB 146 clock *hlc.Clock 147 internalExecutor sqlutil.InternalExecutor 148 settings *cluster.Settings 149 codec keys.SQLCodec 150 151 // group is used for all calls made to acquireNodeLease to prevent 152 // concurrent lease acquisitions from the store. 153 group *singleflight.Group 154 155 // leaseDuration is the mean duration a lease will be acquired for. The 156 // actual duration is jittered using leaseJitterFraction. Jittering is done to 157 // prevent multiple leases from being renewed simultaneously if they were all 158 // acquired simultaneously. 159 leaseDuration time.Duration 160 // leaseJitterFraction is the factor that we use to randomly jitter the lease 161 // duration when acquiring a new lease and the lease renewal timeout. The 162 // range of the actual lease duration will be 163 // [(1-leaseJitterFraction) * leaseDuration, (1+leaseJitterFraction) * leaseDuration] 164 leaseJitterFraction float64 165 // leaseRenewalTimeout is the time before a lease expires when 166 // acquisition to renew the lease begins. 167 leaseRenewalTimeout time.Duration 168 169 testingKnobs StorageTestingKnobs 170 } 171 172 // jitteredLeaseDuration returns a randomly jittered duration from the interval 173 // [(1-leaseJitterFraction) * leaseDuration, (1+leaseJitterFraction) * leaseDuration]. 174 func (s Storage) jitteredLeaseDuration() time.Duration { 175 return time.Duration(float64(s.leaseDuration) * (1 - s.leaseJitterFraction + 176 2*s.leaseJitterFraction*rand.Float64())) 177 } 178 179 // acquire a lease on the most recent version of a table descriptor. 180 // If the lease cannot be obtained because the descriptor is in the process of 181 // being dropped or offline, the error will be of type inactiveTableError. 182 // The expiration time set for the lease > minExpiration. 
183 func (s Storage) acquire( 184 ctx context.Context, minExpiration hlc.Timestamp, tableID sqlbase.ID, 185 ) (*tableVersionState, error) { 186 var table *tableVersionState 187 err := s.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error { 188 // Run the descriptor read as high-priority, thereby pushing any intents out 189 // of its way. We don't want schema changes to prevent lease acquisitions; 190 // we'd rather force them to refresh. Also this prevents deadlocks in cases 191 // where the name resolution is triggered by the transaction doing the 192 // schema change itself. 193 if err := txn.SetUserPriority(roachpb.MaxUserPriority); err != nil { 194 return err 195 } 196 expiration := txn.ReadTimestamp() 197 expiration.WallTime += int64(s.jitteredLeaseDuration()) 198 if expiration.LessEq(minExpiration) { 199 // In the rare circumstances where expiration <= minExpiration 200 // use an expiration based on the minExpiration to guarantee 201 // a monotonically increasing expiration. 202 expiration = minExpiration.Add(int64(time.Millisecond), 0) 203 } 204 205 tableDesc, err := sqlbase.GetTableDescFromID(ctx, txn, s.codec, tableID) 206 if err != nil { 207 return err 208 } 209 if err := sqlbase.FilterTableState(tableDesc); err != nil { 210 return err 211 } 212 if err := tableDesc.MaybeFillInDescriptor(ctx, txn, s.codec); err != nil { 213 return err 214 } 215 // Once the descriptor is set it is immutable and care must be taken 216 // to not modify it. 
217 storedLease := &storedTableLease{ 218 id: tableDesc.ID, 219 version: int(tableDesc.Version), 220 expiration: storedLeaseExpiration(expiration), 221 } 222 table = &tableVersionState{ 223 ImmutableTableDescriptor: *sqlbase.NewImmutableTableDescriptor(*tableDesc), 224 expiration: expiration, 225 } 226 log.VEventf(ctx, 2, "Storage acquired lease %+v", storedLease) 227 table.mu.lease = storedLease 228 229 // ValidateTable instead of Validate, even though we have a txn available, 230 // so we don't block reads waiting for this table version. 231 if err := table.ValidateTable(); err != nil { 232 return err 233 } 234 235 nodeID := s.nodeIDContainer.SQLInstanceID() 236 if nodeID == 0 { 237 panic("zero nodeID") 238 } 239 240 // We use string interpolation here, instead of passing the arguments to 241 // InternalExecutor.Exec() because we don't want to pay for preparing the 242 // statement (which would happen if we'd pass arguments). Besides the 243 // general cost of preparing, preparing this statement always requires a 244 // read from the database for the special descriptor of a system table 245 // (#23937). 246 insertLease := fmt.Sprintf( 247 `INSERT INTO system.public.lease ("descID", version, "nodeID", expiration) VALUES (%d, %d, %d, %s)`, 248 storedLease.id, storedLease.version, nodeID, &storedLease.expiration, 249 ) 250 count, err := s.internalExecutor.Exec(ctx, "lease-insert", txn, insertLease) 251 if err != nil { 252 return err 253 } 254 if count != 1 { 255 return errors.Errorf("%s: expected 1 result, found %d", insertLease, count) 256 } 257 return nil 258 }) 259 if err == nil && s.testingKnobs.LeaseAcquiredEvent != nil { 260 s.testingKnobs.LeaseAcquiredEvent(table.TableDescriptor, nil) 261 } 262 return table, err 263 } 264 265 // Release a previously acquired table descriptor. 
Never let this method 266 // read a table descriptor because it can be called while modifying a 267 // descriptor through a schema change before the schema change has committed 268 // that can result in a deadlock. 269 func (s Storage) release(ctx context.Context, stopper *stop.Stopper, lease *storedTableLease) { 270 retryOptions := base.DefaultRetryOptions() 271 retryOptions.Closer = stopper.ShouldQuiesce() 272 firstAttempt := true 273 // This transaction is idempotent; the retry was put in place because of 274 // NodeUnavailableErrors. 275 for r := retry.Start(retryOptions); r.Next(); { 276 log.VEventf(ctx, 2, "Storage releasing lease %+v", lease) 277 nodeID := s.nodeIDContainer.SQLInstanceID() 278 if nodeID == 0 { 279 panic("zero nodeID") 280 } 281 const deleteLease = `DELETE FROM system.public.lease ` + 282 `WHERE ("descID", version, "nodeID", expiration) = ($1, $2, $3, $4)` 283 count, err := s.internalExecutor.Exec( 284 ctx, 285 "lease-release", 286 nil, /* txn */ 287 deleteLease, 288 lease.id, lease.version, nodeID, &lease.expiration, 289 ) 290 if err != nil { 291 log.Warningf(ctx, "error releasing lease %q: %s", lease, err) 292 firstAttempt = false 293 continue 294 } 295 // We allow count == 0 after the first attempt. 296 if count > 1 || (count == 0 && firstAttempt) { 297 log.Warningf(ctx, "unexpected results while deleting lease %+v: "+ 298 "expected 1 result, found %d", lease, count) 299 } 300 301 if s.testingKnobs.LeaseReleasedEvent != nil { 302 s.testingKnobs.LeaseReleasedEvent( 303 lease.id, sqlbase.DescriptorVersion(lease.version), err) 304 } 305 break 306 } 307 } 308 309 // WaitForOneVersion returns once there are no unexpired leases on the 310 // previous version of the table descriptor. It returns the current version. 311 // After returning there can only be versions of the descriptor >= to the 312 // returned version. 
Lease acquisition (see acquire()) maintains the 313 // invariant that no new leases for desc.Version-1 will be granted once 314 // desc.Version exists. 315 func (s Storage) WaitForOneVersion( 316 ctx context.Context, tableID sqlbase.ID, retryOpts retry.Options, 317 ) (sqlbase.DescriptorVersion, error) { 318 var tableDesc *sqlbase.TableDescriptor 319 var err error 320 for lastCount, r := 0, retry.Start(retryOpts); r.Next(); { 321 // Get the current version of the table descriptor non-transactionally. 322 // 323 // TODO(pmattis): Do an inconsistent read here? 324 tableDesc, err = sqlbase.GetTableDescFromID(ctx, s.db, s.codec, tableID) 325 if err != nil { 326 return 0, err 327 } 328 // Check to see if there are any leases that still exist on the previous 329 // version of the descriptor. 330 now := s.clock.Now() 331 tables := []IDVersion{NewIDVersionPrev(tableDesc)} 332 count, err := CountLeases(ctx, s.internalExecutor, tables, now) 333 if err != nil { 334 return 0, err 335 } 336 if count == 0 { 337 break 338 } 339 if count != lastCount { 340 lastCount = count 341 log.Infof(ctx, "waiting for %d leases to expire: desc=%v", count, tables) 342 } 343 } 344 return tableDesc.Version, nil 345 } 346 347 // ErrDidntUpdateDescriptor can be returned from the update function passed to 348 // PublishMultiple to suppress an error being returned and return the original 349 // values. 350 var ErrDidntUpdateDescriptor = errors.New("didn't update the table descriptor") 351 352 // PublishMultiple updates multiple table descriptors, maintaining the invariant 353 // that there are at most two versions of each descriptor out in the wild at any 354 // time by first waiting for all nodes to be on the current (pre-update) version 355 // of the table desc. 356 // 357 // The update closure for all tables is called after the wait. The map argument 358 // is a map of the table descriptors with the IDs given in tableIDs, and the 359 // closure mutates those descriptors. 
The txn argument closure is intended to be 360 // used for updating jobs. Note that it can't be used for anything except 361 // writing to system tables, since we set the system config trigger to write the 362 // schema changes. 363 // 364 // The closure may be called multiple times if retries occur; make sure it does 365 // not have side effects. 366 // 367 // Returns the updated versions of the descriptors. 368 // 369 // TODO (lucy): Providing the txn for the update closure just to update a job 370 // is not ideal. There must be a better API for this. 371 func (s Storage) PublishMultiple( 372 ctx context.Context, 373 tableIDs []sqlbase.ID, 374 update func(*kv.Txn, map[sqlbase.ID]*sqlbase.MutableTableDescriptor) error, 375 logEvent func(*kv.Txn) error, 376 ) (map[sqlbase.ID]*sqlbase.ImmutableTableDescriptor, error) { 377 errLeaseVersionChanged := errors.New("lease version changed") 378 // Retry while getting errLeaseVersionChanged. 379 for r := retry.Start(base.DefaultRetryOptions()); r.Next(); { 380 // Wait until there are no unexpired leases on the previous versions 381 // of the tables. 382 expectedVersions := make(map[sqlbase.ID]sqlbase.DescriptorVersion) 383 for _, id := range tableIDs { 384 expected, err := s.WaitForOneVersion(ctx, id, base.DefaultRetryOptions()) 385 if err != nil { 386 return nil, err 387 } 388 expectedVersions[id] = expected 389 } 390 391 tableDescs := make(map[sqlbase.ID]*sqlbase.MutableTableDescriptor) 392 // There should be only one version of the descriptor, but it's 393 // a race now to update to the next version. 394 err := s.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error { 395 versions := make(map[sqlbase.ID]sqlbase.DescriptorVersion) 396 descsToUpdate := make(map[sqlbase.ID]*sqlbase.MutableTableDescriptor) 397 for _, id := range tableIDs { 398 // Re-read the current versions of the table descriptor, this time 399 // transactionally. 
400 var err error 401 descsToUpdate[id], err = sqlbase.GetMutableTableDescFromID(ctx, txn, s.codec, id) 402 if err != nil { 403 return err 404 } 405 406 if expectedVersions[id] != descsToUpdate[id].Version { 407 // The version changed out from under us. Someone else must be 408 // performing a schema change operation. 409 if log.V(3) { 410 log.Infof(ctx, "publish (version changed): %d != %d", expectedVersions[id], descsToUpdate[id].Version) 411 } 412 return errLeaseVersionChanged 413 } 414 415 versions[id] = descsToUpdate[id].Version 416 } 417 418 // This is to write the updated descriptors. 419 if err := txn.SetSystemConfigTrigger(); err != nil { 420 return err 421 } 422 423 // Run the update closure. 424 if err := update(txn, descsToUpdate); err != nil { 425 return err 426 } 427 for _, id := range tableIDs { 428 if versions[id] != descsToUpdate[id].Version { 429 return errors.Errorf("updated version to: %d, expected: %d", 430 descsToUpdate[id].Version, versions[id]) 431 } 432 433 if err := descsToUpdate[id].MaybeIncrementVersion(ctx, txn, s.settings); err != nil { 434 return err 435 } 436 if err := descsToUpdate[id].ValidateTable(); err != nil { 437 return err 438 } 439 440 tableDescs[id] = descsToUpdate[id] 441 } 442 443 b := txn.NewBatch() 444 for tableID, tableDesc := range tableDescs { 445 if err := catalogkv.WriteDescToBatch(ctx, false /* kvTrace */, s.settings, b, s.codec, tableID, tableDesc.TableDesc()); err != nil { 446 return err 447 } 448 } 449 if logEvent != nil { 450 // If an event log is required for this update, ensure that the 451 // descriptor change occurs first in the transaction. This is 452 // necessary to ensure that the System configuration change is 453 // gossiped. See the documentation for 454 // transaction.SetSystemConfigTrigger() for more information. 
455 if err := txn.Run(ctx, b); err != nil { 456 return err 457 } 458 if err := logEvent(txn); err != nil { 459 return err 460 } 461 return txn.Commit(ctx) 462 } 463 // More efficient batching can be used if no event log message 464 // is required. 465 return txn.CommitInBatch(ctx, b) 466 }) 467 468 switch { 469 case err == nil || errors.Is(err, ErrDidntUpdateDescriptor): 470 immutTableDescs := make(map[sqlbase.ID]*sqlbase.ImmutableTableDescriptor) 471 for id, tableDesc := range tableDescs { 472 immutTableDescs[id] = sqlbase.NewImmutableTableDescriptor(tableDesc.TableDescriptor) 473 } 474 return immutTableDescs, nil 475 case errors.Is(err, errLeaseVersionChanged): 476 // will loop around to retry 477 default: 478 return nil, err 479 } 480 } 481 482 panic("not reached") 483 } 484 485 // Publish updates a table descriptor. It also maintains the invariant that 486 // there are at most two versions of the descriptor out in the wild at any time 487 // by first waiting for all nodes to be on the current (pre-update) version of 488 // the table desc. 489 // 490 // The update closure is called after the wait, and it provides the new version 491 // of the descriptor to be written. In a multi-step schema operation, this 492 // update should perform a single step. 493 // 494 // The closure may be called multiple times if retries occur; make sure it does 495 // not have side effects. 496 // 497 // Returns the updated version of the descriptor. 498 // TODO (lucy): Maybe have the closure take a *kv.Txn to match 499 // PublishMultiple. 
500 func (s Storage) Publish( 501 ctx context.Context, 502 tableID sqlbase.ID, 503 update func(*sqlbase.MutableTableDescriptor) error, 504 logEvent func(*kv.Txn) error, 505 ) (*sqlbase.ImmutableTableDescriptor, error) { 506 tableIDs := []sqlbase.ID{tableID} 507 updates := func(_ *kv.Txn, descs map[sqlbase.ID]*sqlbase.MutableTableDescriptor) error { 508 desc, ok := descs[tableID] 509 if !ok { 510 return errors.AssertionFailedf("required table with ID %d not provided to update closure", tableID) 511 } 512 return update(desc) 513 } 514 515 results, err := s.PublishMultiple(ctx, tableIDs, updates, logEvent) 516 if err != nil { 517 return nil, err 518 } 519 return results[tableID], nil 520 } 521 522 // IDVersion represents a descriptor ID, version pair that are 523 // meant to map to a single immutable descriptor. 524 type IDVersion struct { 525 // Name is only provided for pretty printing. 526 Name string 527 ID sqlbase.ID 528 Version sqlbase.DescriptorVersion 529 } 530 531 // NewIDVersionPrev returns an initialized IDVersion with the 532 // previous version of the descriptor. 533 func NewIDVersionPrev(desc *sqlbase.TableDescriptor) IDVersion { 534 return IDVersion{Name: desc.Name, ID: desc.ID, Version: desc.Version - 1} 535 } 536 537 // CountLeases returns the number of unexpired leases for a number of tables 538 // each at a particular version at a particular time. 
539 func CountLeases( 540 ctx context.Context, executor sqlutil.InternalExecutor, tables []IDVersion, at hlc.Timestamp, 541 ) (int, error) { 542 var whereClauses []string 543 for _, t := range tables { 544 whereClauses = append(whereClauses, 545 fmt.Sprintf(`("descID" = %d AND version = %d AND expiration > $1)`, 546 t.ID, t.Version), 547 ) 548 } 549 550 stmt := fmt.Sprintf(`SELECT count(1) FROM system.public.lease AS OF SYSTEM TIME %s WHERE `, 551 at.AsOfSystemTime()) + 552 strings.Join(whereClauses, " OR ") 553 values, err := executor.QueryRowEx( 554 ctx, "count-leases", nil, /* txn */ 555 sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser}, 556 stmt, at.GoTime(), 557 ) 558 if err != nil { 559 return 0, err 560 } 561 count := int(tree.MustBeDInt(values[0])) 562 return count, nil 563 } 564 565 // Get the table descriptor valid for the expiration time from the store. 566 // We use a timestamp that is just less than the expiration time to read 567 // a version of the table descriptor. A tableVersionState with the 568 // expiration time set to expiration is returned. 569 // 570 // This returns an error when Replica.checkTSAboveGCThresholdRLocked() 571 // returns an error when the expiration timestamp is less than the storage 572 // layer GC threshold. 
573 func (s Storage) getForExpiration( 574 ctx context.Context, expiration hlc.Timestamp, id sqlbase.ID, 575 ) (*tableVersionState, error) { 576 var table *tableVersionState 577 err := s.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error { 578 descKey := sqlbase.MakeDescMetadataKey(s.codec, id) 579 prevTimestamp := expiration.Prev() 580 txn.SetFixedTimestamp(ctx, prevTimestamp) 581 var desc sqlbase.Descriptor 582 ts, err := txn.GetProtoTs(ctx, descKey, &desc) 583 if err != nil { 584 return err 585 } 586 tableDesc := desc.Table(ts) 587 if tableDesc == nil { 588 return sqlbase.ErrDescriptorNotFound 589 } 590 if prevTimestamp.LessEq(tableDesc.ModificationTime) { 591 return errors.AssertionFailedf("unable to read table= (%d, %s)", id, expiration) 592 } 593 if err := tableDesc.MaybeFillInDescriptor(ctx, txn, s.codec); err != nil { 594 return err 595 } 596 // Create a tableVersionState with the table and without a lease. 597 table = &tableVersionState{ 598 ImmutableTableDescriptor: *sqlbase.NewImmutableTableDescriptor(*tableDesc), 599 expiration: expiration, 600 } 601 return nil 602 }) 603 return table, err 604 } 605 606 // leaseToken is an opaque token representing a lease. It's distinct from a 607 // lease to define restricted capabilities and prevent improper use of a lease 608 // where we instead have leaseTokens. 609 type leaseToken *tableVersionState 610 611 // tableSet maintains an ordered set of tableVersionState objects sorted 612 // by version. It supports addition and removal of elements, finding the 613 // table for a particular version, or finding the most recent table version. 614 // The order is maintained by insert and remove and there can only be a 615 // unique entry for a version. Only the last two versions can be leased, 616 // with the last one being the latest one which is always leased. 
617 // 618 // Each entry represents a time span [ModificationTime, expiration) 619 // and can be used by a transaction iif: 620 // ModificationTime <= transaction.Timestamp < expiration. 621 type tableSet struct { 622 data []*tableVersionState 623 } 624 625 func (l *tableSet) String() string { 626 var buf bytes.Buffer 627 for i, s := range l.data { 628 if i > 0 { 629 buf.WriteString(" ") 630 } 631 buf.WriteString(fmt.Sprintf("%d:%d", s.Version, s.expiration.WallTime)) 632 } 633 return buf.String() 634 } 635 636 func (l *tableSet) insert(s *tableVersionState) { 637 i, match := l.findIndex(s.Version) 638 if match { 639 panic("unable to insert duplicate lease") 640 } 641 if i == len(l.data) { 642 l.data = append(l.data, s) 643 return 644 } 645 l.data = append(l.data, nil) 646 copy(l.data[i+1:], l.data[i:]) 647 l.data[i] = s 648 } 649 650 func (l *tableSet) remove(s *tableVersionState) { 651 i, match := l.findIndex(s.Version) 652 if !match { 653 panic(fmt.Sprintf("can't find lease to remove: %s", s)) 654 } 655 l.data = append(l.data[:i], l.data[i+1:]...) 656 } 657 658 func (l *tableSet) find(version sqlbase.DescriptorVersion) *tableVersionState { 659 if i, match := l.findIndex(version); match { 660 return l.data[i] 661 } 662 return nil 663 } 664 665 func (l *tableSet) findIndex(version sqlbase.DescriptorVersion) (int, bool) { 666 i := sort.Search(len(l.data), func(i int) bool { 667 s := l.data[i] 668 return s.Version >= version 669 }) 670 if i < len(l.data) { 671 s := l.data[i] 672 if s.Version == version { 673 return i, true 674 } 675 } 676 return i, false 677 } 678 679 func (l *tableSet) findNewest() *tableVersionState { 680 if len(l.data) == 0 { 681 return nil 682 } 683 return l.data[len(l.data)-1] 684 } 685 686 func (l *tableSet) findVersion(version sqlbase.DescriptorVersion) *tableVersionState { 687 if len(l.data) == 0 { 688 return nil 689 } 690 // Find the index of the first lease with version > targetVersion. 
691 i := sort.Search(len(l.data), func(i int) bool { 692 return l.data[i].Version > version 693 }) 694 if i == 0 { 695 return nil 696 } 697 // i-1 is the index of the newest lease for the previous version (the version 698 // we're looking for). 699 s := l.data[i-1] 700 if s.Version == version { 701 return s 702 } 703 return nil 704 } 705 706 type tableState struct { 707 id sqlbase.ID 708 stopper *stop.Stopper 709 710 // renewalInProgress is an atomic indicator for when a renewal for a 711 // lease has begun. This is atomic to prevent multiple routines from 712 // entering renewal initialization. 713 renewalInProgress int32 714 715 mu struct { 716 syncutil.Mutex 717 718 // table descriptors sorted by increasing version. This set always 719 // contains a table descriptor version with a lease as the latest 720 // entry. There may be more than one active lease when the system is 721 // transitioning from one version of the descriptor to another or 722 // when the node preemptively acquires a new lease for a version 723 // when the old lease has not yet expired. In the latter case, a new 724 // entry is created with the expiration time of the new lease and 725 // the older entry is removed. 726 active tableSet 727 // Indicates that the table has been dropped, or is being dropped. 728 // If set, leases are released from the store as soon as their 729 // refcount drops to 0, as opposed to waiting until they expire. 730 dropped bool 731 732 // acquisitionsInProgress indicates that at least one caller is currently 733 // in the process of performing an acquisition. This tracking is critical 734 // to ensure that notifications of new versions which arrive before a lease 735 // acquisition finishes but indicate that that new lease is expired are not 736 // ignored. 737 acquisitionsInProgress int 738 } 739 } 740 741 // ensureVersion ensures that the latest version >= minVersion. 
It will 742 // check if the latest known version meets the criterion, or attempt to 743 // acquire a lease at the latest version with the hope that it meets 744 // the criterion. 745 func ensureVersion( 746 ctx context.Context, tableID sqlbase.ID, minVersion sqlbase.DescriptorVersion, m *Manager, 747 ) error { 748 if s := m.findNewest(tableID); s != nil && minVersion <= s.Version { 749 return nil 750 } 751 752 if err := m.AcquireFreshestFromStore(ctx, tableID); err != nil { 753 return err 754 } 755 756 if s := m.findNewest(tableID); s != nil && s.Version < minVersion { 757 return errors.Errorf("version %d for table %s does not exist yet", minVersion, s.Name) 758 } 759 return nil 760 } 761 762 // findForTimestamp finds a table descriptor valid for the timestamp. 763 // In the most common case the timestamp passed to this method is close 764 // to the current time and in all likelihood the latest version of a 765 // table descriptor if valid is returned. 766 // 767 // This returns errRenewLease when there is no table descriptor cached 768 // or the latest descriptor version's ModificationTime satisfies the 769 // timestamp while it's expiration time doesn't satisfy the timestamp. 770 // This is an optimistic strategy betting that in all likelihood a 771 // higher layer renewing the lease on the descriptor and populating 772 // tableState will satisfy the timestamp on a subsequent call. 773 // 774 // In all other circumstances where a descriptor cannot be found for the 775 // timestamp errOlderReadTableVersion is returned requesting a higher layer 776 // to populate the tableState with a valid older version of the descriptor 777 // before calling. 778 // 779 // The refcount for the returned tableVersionState is incremented. 780 // It returns true if the descriptor returned is the known latest version 781 // of the descriptor. 
782 func (t *tableState) findForTimestamp( 783 ctx context.Context, timestamp hlc.Timestamp, 784 ) (*tableVersionState, bool, error) { 785 t.mu.Lock() 786 defer t.mu.Unlock() 787 788 // Acquire a lease if no table descriptor exists in the cache. 789 if len(t.mu.active.data) == 0 { 790 return nil, false, errRenewLease 791 } 792 793 // Walk back the versions to find one that is valid for the timestamp. 794 for i := len(t.mu.active.data) - 1; i >= 0; i-- { 795 // Check to see if the ModificationTime is valid. 796 if table := t.mu.active.data[i]; table.ModificationTime.LessEq(timestamp) { 797 latest := i+1 == len(t.mu.active.data) 798 if !table.hasExpired(timestamp) { 799 // Existing valid table version. 800 table.incRefcount() 801 return table, latest, nil 802 } 803 804 if latest { 805 // Renew the lease if the lease has expired 806 // The latest descriptor always has a lease. 807 return nil, false, errRenewLease 808 } 809 break 810 } 811 } 812 813 return nil, false, errReadOlderTableVersion 814 } 815 816 // Read an older table descriptor version for the particular timestamp 817 // from the store. We unfortunately need to read more than one table 818 // version just so that we can set the expiration time on the descriptor 819 // properly. 820 // 821 // TODO(vivek): Future work: 822 // 1. Read multiple versions of a descriptor through one kv call. 823 // 2. Translate multiple simultaneous calls to this method into a single call 824 // as is done for acquireNodeLease(). 825 // 3. Figure out a sane policy on when these descriptors should be purged. 826 // They are currently purged in PurgeOldVersions. 
827 func (m *Manager) readOlderVersionForTimestamp( 828 ctx context.Context, tableID sqlbase.ID, timestamp hlc.Timestamp, 829 ) ([]*tableVersionState, error) { 830 expiration, done := func() (hlc.Timestamp, bool) { 831 t := m.findTableState(tableID, false /* create */) 832 t.mu.Lock() 833 defer t.mu.Unlock() 834 afterIdx := 0 835 // Walk back the versions to find one that is valid for the timestamp. 836 for i := len(t.mu.active.data) - 1; i >= 0; i-- { 837 // Check to see if the ModificationTime is valid. 838 if table := t.mu.active.data[i]; table.ModificationTime.LessEq(timestamp) { 839 if timestamp.Less(table.expiration) { 840 // Existing valid table version. 841 return table.expiration, true 842 } 843 // We need a version after data[i], but before data[i+1]. 844 // We could very well use the timestamp to read the table 845 // descriptor, but unfortunately we will not be able to assign 846 // it a proper expiration time. Therefore, we read table 847 // descriptors versions one by one from afterIdx back into the 848 // past until we find a valid one. 849 afterIdx = i + 1 850 break 851 } 852 } 853 854 if afterIdx == len(t.mu.active.data) { 855 return hlc.Timestamp{}, true 856 } 857 858 // Read table descriptor versions one by one into the past until we 859 // find a valid one. Every version is assigned an expiration time that 860 // is the ModificationTime of the previous one read. 861 return t.mu.active.data[afterIdx].ModificationTime, false 862 }() 863 if done { 864 return nil, nil 865 } 866 867 // Read descriptors from the store. 868 var versions []*tableVersionState 869 for { 870 table, err := m.Storage.getForExpiration(ctx, expiration, tableID) 871 if err != nil { 872 return nil, err 873 } 874 versions = append(versions, table) 875 if table.ModificationTime.LessEq(timestamp) { 876 break 877 } 878 // Set the expiration time for the next table. 879 expiration = table.ModificationTime 880 } 881 882 return versions, nil 883 } 884 885 // Insert table versions. 
The versions provided are not in 886 // any particular order. 887 func (m *Manager) insertTableVersions(tableID sqlbase.ID, versions []*tableVersionState) { 888 t := m.findTableState(tableID, false /* create */) 889 t.mu.Lock() 890 defer t.mu.Unlock() 891 for _, tableVersion := range versions { 892 // Since we gave up the lock while reading the versions from 893 // the store we have to ensure that no one else inserted the 894 // same table version. 895 table := t.mu.active.findVersion(tableVersion.Version) 896 if table == nil { 897 t.mu.active.insert(tableVersion) 898 } 899 } 900 } 901 902 // AcquireFreshestFromStore acquires a new lease from the store and 903 // inserts it into the active set. It guarantees that the lease returned is 904 // the one acquired after the call is made. Use this if the lease we want to 905 // get needs to see some descriptor updates that we know happened recently. 906 func (m *Manager) AcquireFreshestFromStore(ctx context.Context, tableID sqlbase.ID) error { 907 // Create tableState if needed. 908 _ = m.findTableState(tableID, true /* create */) 909 // We need to acquire a lease on a "fresh" descriptor, meaning that joining 910 // a potential in-progress lease acquisition is generally not good enough. 911 // If we are to join an in-progress acquisition, it needs to be an acquisition 912 // initiated after this point. 913 // So, we handle two cases: 914 // 1. The first DoChan() call tells us that we didn't join an in-progress 915 // acquisition. Great, the lease that's being acquired is good. 916 // 2. The first DoChan() call tells us that we did join an in-progress acq. 917 // We have to wait this acquisition out; it's not good for us. But any 918 // future acquisition is good, so the next time around the loop it doesn't 919 // matter if we initiate a request or join an in-progress one. 
920 // In both cases, we need to check if the lease we want is still valid because 921 // lease acquisition is done without holding the tableState lock, so anything 922 // can happen in between lease acquisition and us getting control again. 923 attemptsMade := 0 924 for { 925 // Acquire a fresh table lease. 926 didAcquire, err := acquireNodeLease(ctx, m, tableID) 927 if m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent != nil { 928 m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent(AcquireFreshestBlock) 929 } 930 if err != nil { 931 return err 932 } 933 934 if didAcquire { 935 // Case 1: we didn't join an in-progress call and the lease is still 936 // valid. 937 break 938 } else if attemptsMade > 1 { 939 // Case 2: more than one acquisition has happened and the lease is still 940 // valid. 941 break 942 } 943 attemptsMade++ 944 } 945 return nil 946 } 947 948 // upsertLocked inserts a lease for a particular table version. 949 // If an existing lease exists for the table version it replaces 950 // it and returns it. 951 func (t *tableState) upsertLocked( 952 ctx context.Context, table *tableVersionState, 953 ) (_ *storedTableLease, _ error) { 954 s := t.mu.active.find(table.Version) 955 if s == nil { 956 if t.mu.active.findNewest() != nil { 957 log.Infof(ctx, "new lease: %s", table) 958 } 959 t.mu.active.insert(table) 960 return nil, nil 961 } 962 963 // The table is replacing an existing one at the same version. 964 if !table.hasValidExpiration(s) { 965 // This is a violation of an invariant and can actually not 966 // happen. We return an error here to aid in further investigations. 967 return nil, errors.Errorf("lease expiration monotonicity violation, (%s) vs (%s)", s, table) 968 } 969 970 s.mu.Lock() 971 table.mu.Lock() 972 // subsume the refcount of the older lease. 
This is permitted because 973 // the new lease has a greater expiration than the older lease and 974 // any transaction using the older lease can safely use a deadline set 975 // to the older lease's expiration even though the older lease is 976 // released! This is because the new lease is valid at the same table 977 // version at a greater expiration. 978 table.mu.refcount += s.mu.refcount 979 s.mu.refcount = 0 980 l := s.mu.lease 981 s.mu.lease = nil 982 if log.V(2) { 983 log.VEventf(ctx, 2, "replaced lease: %s with %s", s.stringLocked(), table.stringLocked()) 984 } 985 table.mu.Unlock() 986 s.mu.Unlock() 987 t.mu.active.remove(s) 988 t.mu.active.insert(table) 989 return l, nil 990 } 991 992 // removeInactiveVersions removes inactive versions in t.mu.active.data with refcount 0. 993 // t.mu must be locked. It returns table version state that need to be released. 994 func (t *tableState) removeInactiveVersions() []*storedTableLease { 995 var leases []*storedTableLease 996 // A copy of t.mu.active.data must be made since t.mu.active.data will be changed 997 // within the loop. 998 for _, table := range append([]*tableVersionState(nil), t.mu.active.data...) { 999 func() { 1000 table.mu.Lock() 1001 defer table.mu.Unlock() 1002 if table.mu.refcount == 0 { 1003 t.mu.active.remove(table) 1004 if l := table.mu.lease; l != nil { 1005 table.mu.lease = nil 1006 leases = append(leases, l) 1007 } 1008 } 1009 }() 1010 } 1011 return leases 1012 } 1013 1014 // If the lease cannot be obtained because the descriptor is in the process of 1015 // being dropped or offline, the error will be of type inactiveTableError. 1016 // The boolean returned is true if this call was actually responsible for the 1017 // lease acquisition. 
func acquireNodeLease(ctx context.Context, m *Manager, id sqlbase.ID) (bool, error) {
	// toRelease receives the lease (if any) that the newly acquired lease
	// replaces in the active set.
	var toRelease *storedTableLease
	// Acquisitions for the same table ID share one singleflight key, so
	// concurrent callers wait on a single in-flight acquisition.
	resultChan, didAcquire := m.group.DoChan(fmt.Sprintf("acquire%d", id), func() (interface{}, error) {
		// Note that we use a new `context` here to avoid a situation where a cancellation
		// of the first context cancels other callers to the `acquireNodeLease()` method,
		// because of its use of `singleflight.Group`. See issue #41780 for how this has
		// happened.
		newCtx, cancel := m.stopper.WithCancelOnQuiesce(logtags.WithTags(context.Background(), logtags.FromContext(ctx)))
		defer cancel()
		if m.isDraining() {
			return nil, errors.New("cannot acquire lease when draining")
		}
		// The expiration of the newest cached version (if there is one) is
		// passed to the store as the minimum expiration for the new lease.
		newest := m.findNewest(id)
		var minExpiration hlc.Timestamp
		if newest != nil {
			minExpiration = newest.expiration
		}
		table, err := m.Storage.acquire(newCtx, minExpiration, id)
		if err != nil {
			return nil, err
		}
		// The store call above ran without the tableState lock; take it now
		// to publish the new version.
		t := m.findTableState(id, false /* create */)
		t.mu.Lock()
		defer t.mu.Unlock()
		toRelease, err = t.upsertLocked(newCtx, table)
		if err != nil {
			return nil, err
		}
		m.tableNames.insert(table)
		// NOTE(review): this runs while t.mu is still held (the unlock is
		// deferred); releaseLease releases synchronously when draining.
		if toRelease != nil {
			releaseLease(toRelease, m)
		}
		return leaseToken(table), nil
	})
	// Wait for the shared acquisition, but give up as soon as the caller's
	// own context is done.
	select {
	case <-ctx.Done():
		return false, ctx.Err()
	case result := <-resultChan:
		if result.Err != nil {
			return false, result.Err
		}
	}
	return didAcquire, nil
}

// release returns a tableVersionState that needs to be released from
// the store.
1065 func (t *tableState) release( 1066 table *sqlbase.ImmutableTableDescriptor, removeOnceDereferenced bool, 1067 ) (*storedTableLease, error) { 1068 t.mu.Lock() 1069 defer t.mu.Unlock() 1070 1071 s := t.mu.active.find(table.Version) 1072 if s == nil { 1073 return nil, errors.Errorf("table %d version %d not found", table.ID, table.Version) 1074 } 1075 // Decrements the refcount and returns true if the lease has to be removed 1076 // from the store. 1077 decRefcount := func(s *tableVersionState) *storedTableLease { 1078 // Figure out if we'd like to remove the lease from the store asap (i.e. 1079 // when the refcount drops to 0). If so, we'll need to mark the lease as 1080 // invalid. 1081 removeOnceDereferenced = removeOnceDereferenced || 1082 // Release from the store if the table has been dropped; no leases 1083 // can be acquired any more. 1084 t.mu.dropped || 1085 // Release from the store if the lease is not for the latest 1086 // version; only leases for the latest version can be acquired. 1087 s != t.mu.active.findNewest() 1088 1089 s.mu.Lock() 1090 defer s.mu.Unlock() 1091 s.mu.refcount-- 1092 if log.V(2) { 1093 log.VEventf(context.TODO(), 2, "release: %s", s.stringLocked()) 1094 } 1095 if s.mu.refcount < 0 { 1096 panic(fmt.Sprintf("negative ref count: %s", s)) 1097 } 1098 1099 if s.mu.refcount == 0 && s.mu.lease != nil && removeOnceDereferenced { 1100 l := s.mu.lease 1101 s.mu.lease = nil 1102 return l 1103 } 1104 return nil 1105 } 1106 if l := decRefcount(s); l != nil { 1107 t.mu.active.remove(s) 1108 return l, nil 1109 } 1110 return nil, nil 1111 } 1112 1113 // releaseLease from store. 1114 func releaseLease(lease *storedTableLease, m *Manager) { 1115 ctx := context.TODO() 1116 if m.isDraining() { 1117 // Release synchronously to guarantee release before exiting. 1118 m.Storage.release(ctx, m.stopper, lease) 1119 return 1120 } 1121 1122 // Release to the store asynchronously, without the tableState lock. 
1123 if err := m.stopper.RunAsyncTask( 1124 ctx, "sql.tableState: releasing descriptor lease", 1125 func(ctx context.Context) { 1126 m.Storage.release(ctx, m.stopper, lease) 1127 }); err != nil { 1128 log.Warningf(ctx, "error: %s, not releasing lease: %q", err, lease) 1129 } 1130 } 1131 1132 // purgeOldVersions removes old unused table descriptor versions older than 1133 // minVersion and releases any associated leases. 1134 // If takenOffline is set, minVersion is ignored; no lease is acquired and all 1135 // existing unused versions are removed. The table is further marked dropped, 1136 // which will cause existing in-use leases to be eagerly released once 1137 // they're not in use any more. 1138 // If t has no active leases, nothing is done. 1139 func purgeOldVersions( 1140 ctx context.Context, 1141 db *kv.DB, 1142 id sqlbase.ID, 1143 takenOffline bool, 1144 minVersion sqlbase.DescriptorVersion, 1145 m *Manager, 1146 ) error { 1147 t := m.findTableState(id, false /*create*/) 1148 if t == nil { 1149 return nil 1150 } 1151 t.mu.Lock() 1152 empty := len(t.mu.active.data) == 0 && t.mu.acquisitionsInProgress == 0 1153 t.mu.Unlock() 1154 if empty { 1155 // We don't currently have a version on this table, so no need to refresh 1156 // anything. 1157 return nil 1158 } 1159 1160 removeInactives := func(drop bool) { 1161 t.mu.Lock() 1162 t.mu.dropped = drop 1163 leases := t.removeInactiveVersions() 1164 t.mu.Unlock() 1165 for _, l := range leases { 1166 releaseLease(l, m) 1167 } 1168 } 1169 1170 if takenOffline { 1171 removeInactives(takenOffline) 1172 return nil 1173 } 1174 1175 if err := ensureVersion(ctx, id, minVersion, m); err != nil { 1176 return err 1177 } 1178 1179 // Acquire a refcount on the table on the latest version to maintain an 1180 // active lease, so that it doesn't get released when removeInactives() 1181 // is called below. Release this lease after calling removeInactives(). 
1182 table, _, err := t.findForTimestamp(ctx, m.clock.Now()) 1183 if isInactive := sqlbase.HasInactiveTableError(err); err == nil || isInactive { 1184 removeInactives(isInactive) 1185 if table != nil { 1186 s, err := t.release(&table.ImmutableTableDescriptor, m.removeOnceDereferenced()) 1187 if err != nil { 1188 return err 1189 } 1190 if s != nil { 1191 releaseLease(s, m) 1192 } 1193 return nil 1194 } 1195 return nil 1196 } 1197 return err 1198 } 1199 1200 // maybeQueueLeaseRenewal queues a lease renewal if there is not already a lease 1201 // renewal in progress. 1202 func (t *tableState) maybeQueueLeaseRenewal( 1203 ctx context.Context, m *Manager, tableID sqlbase.ID, tableName string, 1204 ) error { 1205 if !atomic.CompareAndSwapInt32(&t.renewalInProgress, 0, 1) { 1206 return nil 1207 } 1208 1209 // Start the renewal. When it finishes, it will reset t.renewalInProgress. 1210 return t.stopper.RunAsyncTask(context.Background(), 1211 "lease renewal", func(ctx context.Context) { 1212 var cleanup func() 1213 ctx, cleanup = tracing.EnsureContext(ctx, m.ambientCtx.Tracer, "lease renewal") 1214 defer cleanup() 1215 t.startLeaseRenewal(ctx, m, tableID, tableName) 1216 }) 1217 } 1218 1219 // startLeaseRenewal starts a singleflight.Group to acquire a lease. 1220 // This function blocks until lease acquisition completes. 1221 // t.renewalInProgress must be set to 1 before calling. 
1222 func (t *tableState) startLeaseRenewal( 1223 ctx context.Context, m *Manager, tableID sqlbase.ID, tableName string, 1224 ) { 1225 log.VEventf(ctx, 1, 1226 "background lease renewal beginning for tableID=%d tableName=%q", 1227 tableID, tableName) 1228 if _, err := acquireNodeLease(ctx, m, tableID); err != nil { 1229 log.Errorf(ctx, 1230 "background lease renewal for tableID=%d tableName=%q failed: %s", 1231 tableID, tableName, err) 1232 } else { 1233 log.VEventf(ctx, 1, 1234 "background lease renewal finished for tableID=%d tableName=%q", 1235 tableID, tableName) 1236 } 1237 atomic.StoreInt32(&t.renewalInProgress, 0) 1238 } 1239 1240 // markAcquisitionStart increments the acquisitionsInProgress counter. 1241 func (t *tableState) markAcquisitionStart(ctx context.Context) { 1242 t.mu.Lock() 1243 defer t.mu.Unlock() 1244 t.mu.acquisitionsInProgress++ 1245 } 1246 1247 // markAcquisitionDone decrements the acquisitionsInProgress counter. 1248 func (t *tableState) markAcquisitionDone(ctx context.Context) { 1249 t.mu.Lock() 1250 defer t.mu.Unlock() 1251 t.mu.acquisitionsInProgress-- 1252 } 1253 1254 // AcquireBlockType is the type of blocking result event when 1255 // calling LeaseAcquireResultBlockEvent. 1256 type AcquireBlockType int 1257 1258 const ( 1259 // AcquireBlock denotes the LeaseAcquireResultBlockEvent is 1260 // coming from tableState.acquire(). 1261 AcquireBlock AcquireBlockType = iota 1262 // AcquireFreshestBlock denotes the LeaseAcquireResultBlockEvent is 1263 // from tableState.acquireFreshestFromStore(). 1264 AcquireFreshestBlock 1265 ) 1266 1267 // StorageTestingKnobs contains testing knobs. 1268 type StorageTestingKnobs struct { 1269 // Called after a lease is removed from the store, with any operation error. 1270 // See LeaseRemovalTracker. 1271 LeaseReleasedEvent func(id sqlbase.ID, version sqlbase.DescriptorVersion, err error) 1272 // Called after a lease is acquired, with any operation error. 
1273 LeaseAcquiredEvent func(table sqlbase.TableDescriptor, err error) 1274 // Called before waiting on a results from a DoChan call of acquireNodeLease 1275 // in tableState.acquire() and tableState.acquireFreshestFromStore(). 1276 LeaseAcquireResultBlockEvent func(leaseBlockType AcquireBlockType) 1277 // RemoveOnceDereferenced forces leases to be removed 1278 // as soon as they are dereferenced. 1279 RemoveOnceDereferenced bool 1280 } 1281 1282 // ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface. 1283 func (*StorageTestingKnobs) ModuleTestingKnobs() {} 1284 1285 var _ base.ModuleTestingKnobs = &StorageTestingKnobs{} 1286 1287 // ManagerTestingKnobs contains test knobs. 1288 type ManagerTestingKnobs struct { 1289 1290 // A callback called after the leases are refreshed as a result of a gossip update. 1291 TestingTableRefreshedEvent func(descriptor *sqlbase.TableDescriptor) 1292 1293 // TestingTableUpdateEvent is a callback when an update is received, before 1294 // the leases are refreshed. If a non-nil error is returned, the update is 1295 // ignored. 1296 TestingTableUpdateEvent func(descriptor *sqlbase.TableDescriptor) error 1297 1298 // To disable the deletion of orphaned leases at server startup. 1299 DisableDeleteOrphanedLeases bool 1300 1301 // AlwaysUseRangefeeds ensures that rangefeeds and not gossip are used to 1302 // detect changes to table descriptors. 1303 AlwaysUseRangefeeds bool 1304 1305 // VersionPollIntervalForRangefeeds controls the polling interval for the 1306 // check whether the requisite version for rangefeed-based notifications has 1307 // been finalized. 1308 // 1309 // TODO(ajwerner): Remove this and replace it with a callback. 1310 VersionPollIntervalForRangefeeds time.Duration 1311 1312 LeaseStoreTestingKnobs StorageTestingKnobs 1313 } 1314 1315 var _ base.ModuleTestingKnobs = &ManagerTestingKnobs{} 1316 1317 // ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface. 
1318 func (*ManagerTestingKnobs) ModuleTestingKnobs() {} 1319 1320 type tableNameCacheKey struct { 1321 dbID sqlbase.ID 1322 schemaID sqlbase.ID 1323 normalizeTabledName string 1324 } 1325 1326 // tableNameCache is a cache of table name -> latest table version mappings. 1327 // The Manager updates the cache every time a lease is acquired or released 1328 // from the store. The cache maintains the latest version for each table name. 1329 // All methods are thread-safe. 1330 type tableNameCache struct { 1331 mu syncutil.Mutex 1332 tables map[tableNameCacheKey]*tableVersionState 1333 } 1334 1335 // Resolves a (database ID, table name) to the table descriptor's ID. 1336 // Returns a valid tableVersionState for the table with that name, 1337 // if the name had been previously cached and the cache has a table 1338 // version that has not expired. Returns nil otherwise. 1339 // This method handles normalizing the table name. 1340 // The table's refcount is incremented before returning, so the caller 1341 // is responsible for releasing it to the leaseManager. 1342 func (c *tableNameCache) get( 1343 dbID sqlbase.ID, schemaID sqlbase.ID, tableName string, timestamp hlc.Timestamp, 1344 ) *tableVersionState { 1345 c.mu.Lock() 1346 table, ok := c.tables[makeTableNameCacheKey(dbID, schemaID, tableName)] 1347 c.mu.Unlock() 1348 if !ok { 1349 return nil 1350 } 1351 table.mu.Lock() 1352 if table.mu.lease == nil { 1353 table.mu.Unlock() 1354 // This get() raced with a release operation. Remove this cache 1355 // entry if needed. 1356 c.remove(table) 1357 return nil 1358 } 1359 1360 defer table.mu.Unlock() 1361 1362 if !NameMatchesTable( 1363 &table.ImmutableTableDescriptor.TableDescriptor, 1364 dbID, 1365 schemaID, 1366 tableName, 1367 ) { 1368 panic(fmt.Sprintf("Out of sync entry in the name cache. "+ 1369 "Cache entry: %d.%q -> %d. Lease: %d.%q.", 1370 dbID, tableName, table.ID, table.ParentID, table.Name)) 1371 } 1372 1373 // Expired table. Don't hand it out. 
1374 if table.hasExpired(timestamp) { 1375 return nil 1376 } 1377 1378 table.incRefcountLocked() 1379 return table 1380 } 1381 1382 func (c *tableNameCache) insert(table *tableVersionState) { 1383 c.mu.Lock() 1384 defer c.mu.Unlock() 1385 1386 key := makeTableNameCacheKey(table.ParentID, table.GetParentSchemaID(), table.Name) 1387 existing, ok := c.tables[key] 1388 if !ok { 1389 c.tables[key] = table 1390 return 1391 } 1392 // If we already have a lease in the cache for this name, see if this one is 1393 // better (higher version or later expiration). 1394 if table.Version > existing.Version || 1395 (table.Version == existing.Version && table.hasValidExpiration(existing)) { 1396 // Overwrite the old table. The new one is better. From now on, we want 1397 // clients to use the new one. 1398 c.tables[key] = table 1399 } 1400 } 1401 1402 func (c *tableNameCache) remove(table *tableVersionState) { 1403 c.mu.Lock() 1404 defer c.mu.Unlock() 1405 1406 key := makeTableNameCacheKey(table.ParentID, table.GetParentSchemaID(), table.Name) 1407 existing, ok := c.tables[key] 1408 if !ok { 1409 // Table for lease not found in table name cache. This can happen if we had 1410 // a more recent lease on the table in the tableNameCache, then the table 1411 // gets dropped, then the more recent lease is remove()d - which clears the 1412 // cache. 1413 return 1414 } 1415 // If this was the lease that the cache had for the table name, remove it. 1416 // If the cache had some other table, this remove is a no-op. 1417 if existing == table { 1418 delete(c.tables, key) 1419 } 1420 } 1421 1422 func makeTableNameCacheKey( 1423 dbID sqlbase.ID, schemaID sqlbase.ID, tableName string, 1424 ) tableNameCacheKey { 1425 return tableNameCacheKey{dbID, schemaID, tableName} 1426 } 1427 1428 // Manager manages acquiring and releasing per-table leases. It also 1429 // handles resolving table names to descriptor IDs. 
// The leases are managed
// internally with a table descriptor and expiration time exported by the
// API. The table descriptor acquired needs to be released. A transaction
// can use a table descriptor as long as its timestamp is within the
// validity window for the descriptor:
// descriptor.ModificationTime <= txn.Timestamp < expirationTime
//
// Exported only for testing.
//
// The locking order is:
// Manager.mu > tableState.mu > tableNameCache.mu > tableVersionState.mu
type Manager struct {
	Storage
	mu struct {
		syncutil.Mutex
		// tables maps a table ID to the per-table lease state.
		tables map[sqlbase.ID]*tableState

		// updatesResolvedTimestamp keeps track of a timestamp before which all
		// table updates have already been seen.
		updatesResolvedTimestamp hlc.Timestamp
	}

	// draining holds a bool; it is written by SetDraining and read by
	// isDraining.
	draining atomic.Value

	// tableNames is a cache for name -> id mappings. A mapping for the cache
	// should only be used if we currently have an active lease on the respective
	// id; otherwise, the mapping may well be stale.
	// Not protected by mu.
	tableNames   tableNameCache
	testingKnobs ManagerTestingKnobs
	ambientCtx   log.AmbientContext
	// stopper runs the asynchronous lease release tasks.
	stopper *stop.Stopper
	// sem is a quota pool of size leaseConcurrencyLimit created in
	// NewLeaseManager.
	sem *quotapool.IntPool
}

// leaseConcurrencyLimit is the size of the "lease manager" quota pool.
const leaseConcurrencyLimit = 5

// NewLeaseManager creates a new Manager.
//
// internalExecutor can be nil to help bootstrapping, but then it needs to be set via
// SetInternalExecutor before the Manager is used.
//
// stopper is used to run async tasks. Can be nil in tests.
// NOTE(review): NewLeaseManager calls lm.stopper.AddCloser unconditionally, so
// a nil stopper may not actually be supported any more — confirm.
1472 func NewLeaseManager( 1473 ambientCtx log.AmbientContext, 1474 nodeIDContainer *base.SQLIDContainer, 1475 db *kv.DB, 1476 clock *hlc.Clock, 1477 internalExecutor sqlutil.InternalExecutor, 1478 settings *cluster.Settings, 1479 codec keys.SQLCodec, 1480 testingKnobs ManagerTestingKnobs, 1481 stopper *stop.Stopper, 1482 cfg *base.LeaseManagerConfig, 1483 ) *Manager { 1484 lm := &Manager{ 1485 Storage: Storage{ 1486 nodeIDContainer: nodeIDContainer, 1487 db: db, 1488 clock: clock, 1489 internalExecutor: internalExecutor, 1490 settings: settings, 1491 codec: codec, 1492 group: &singleflight.Group{}, 1493 leaseDuration: cfg.TableDescriptorLeaseDuration, 1494 leaseJitterFraction: cfg.TableDescriptorLeaseJitterFraction, 1495 leaseRenewalTimeout: cfg.TableDescriptorLeaseRenewalTimeout, 1496 testingKnobs: testingKnobs.LeaseStoreTestingKnobs, 1497 }, 1498 testingKnobs: testingKnobs, 1499 tableNames: tableNameCache{ 1500 tables: make(map[tableNameCacheKey]*tableVersionState), 1501 }, 1502 ambientCtx: ambientCtx, 1503 stopper: stopper, 1504 sem: quotapool.NewIntPool("lease manager", leaseConcurrencyLimit), 1505 } 1506 lm.stopper.AddCloser(lm.sem.Closer("stopper")) 1507 lm.mu.tables = make(map[sqlbase.ID]*tableState) 1508 lm.mu.updatesResolvedTimestamp = db.Clock().Now() 1509 1510 lm.draining.Store(false) 1511 return lm 1512 } 1513 1514 // NameMatchesTable returns true if the provided name and IDs match this 1515 // descriptor. 1516 func NameMatchesTable( 1517 table *sqlbase.TableDescriptor, dbID sqlbase.ID, schemaID sqlbase.ID, tableName string, 1518 ) bool { 1519 return table.ParentID == dbID && table.Name == tableName && 1520 table.GetParentSchemaID() == schemaID 1521 } 1522 1523 // findNewest returns the newest table version state for the tableID. 
1524 func (m *Manager) findNewest(tableID sqlbase.ID) *tableVersionState { 1525 t := m.findTableState(tableID, false /* create */) 1526 t.mu.Lock() 1527 defer t.mu.Unlock() 1528 return t.mu.active.findNewest() 1529 } 1530 1531 // AcquireByName returns a table version for the specified table valid for 1532 // the timestamp. It returns the table descriptor and a expiration time. 1533 // A transaction using this descriptor must ensure that its 1534 // commit-timestamp < expiration-time. Care must be taken to not modify 1535 // the returned descriptor. Renewal of a lease may begin in the 1536 // background. Renewal is done in order to prevent blocking on future 1537 // acquisitions. 1538 // 1539 // Known limitation: AcquireByName() calls Acquire() and therefore suffers 1540 // from the same limitation as Acquire (See Acquire). AcquireByName() is 1541 // unable to function correctly on a timestamp less than the timestamp 1542 // of a transaction with a DROP/TRUNCATE on a table. The limitation in 1543 // the face of a DROP follows directly from the limitation on Acquire(). 1544 // A TRUNCATE is implemented by changing the name -> id mapping for a table 1545 // and by dropping the descriptor with the old id. While AcquireByName 1546 // can use the timestamp and get the correct name->id mapping at a 1547 // timestamp, it uses Acquire() to get a descriptor with the corresponding 1548 // id and fails because the id has been dropped by the TRUNCATE. 1549 func (m *Manager) AcquireByName( 1550 ctx context.Context, 1551 timestamp hlc.Timestamp, 1552 dbID sqlbase.ID, 1553 schemaID sqlbase.ID, 1554 tableName string, 1555 ) (*sqlbase.ImmutableTableDescriptor, hlc.Timestamp, error) { 1556 // Check if we have cached an ID for this name. 1557 tableVersion := m.tableNames.get(dbID, schemaID, tableName, timestamp) 1558 if tableVersion != nil { 1559 if tableVersion.ModificationTime.LessEq(timestamp) { 1560 // If this lease is nearly expired, ensure a renewal is queued. 
1561 durationUntilExpiry := time.Duration(tableVersion.expiration.WallTime - timestamp.WallTime) 1562 if durationUntilExpiry < m.Storage.leaseRenewalTimeout { 1563 if t := m.findTableState(tableVersion.ID, false /* create */); t != nil { 1564 if err := t.maybeQueueLeaseRenewal( 1565 ctx, m, tableVersion.ID, tableName); err != nil { 1566 return nil, hlc.Timestamp{}, err 1567 } 1568 } 1569 } 1570 return &tableVersion.ImmutableTableDescriptor, tableVersion.expiration, nil 1571 } 1572 if err := m.Release(&tableVersion.ImmutableTableDescriptor); err != nil { 1573 return nil, hlc.Timestamp{}, err 1574 } 1575 // Return a valid table descriptor for the timestamp. 1576 table, expiration, err := m.Acquire(ctx, timestamp, tableVersion.ID) 1577 if err != nil { 1578 return nil, hlc.Timestamp{}, err 1579 } 1580 return table, expiration, nil 1581 } 1582 1583 // We failed to find something in the cache, or what we found is not 1584 // guaranteed to be valid by the time we use it because we don't have a 1585 // lease with at least a bit of lifetime left in it. So, we do it the hard 1586 // way: look in the database to resolve the name, then acquire a new table. 1587 var err error 1588 tableID, err := m.resolveName(ctx, timestamp, dbID, schemaID, tableName) 1589 if err != nil { 1590 return nil, hlc.Timestamp{}, err 1591 } 1592 table, expiration, err := m.Acquire(ctx, timestamp, tableID) 1593 if err != nil { 1594 return nil, hlc.Timestamp{}, err 1595 } 1596 if !NameMatchesTable(&table.TableDescriptor, dbID, schemaID, tableName) { 1597 // We resolved name `tableName`, but the lease has a different name in it. 1598 // That can mean two things. Assume the table is being renamed from A to B. 1599 // a) `tableName` is A. The transaction doing the RENAME committed (so the 1600 // descriptor has been updated to B), but its schema changer has not 1601 // finished yet. B is the new name of the table, queries should use that. 
If 1602 // we already had a lease with name A, we would've allowed to use it (but we 1603 // don't, otherwise the cache lookup above would've given it to us). Since 1604 // we don't, let's not allow A to be used, given that the lease now has name 1605 // B in it. It'd be sketchy to allow A to be used with an inconsistent name 1606 // in the table. 1607 // 1608 // b) `tableName` is B. Like in a), the transaction doing the RENAME 1609 // committed (so the descriptor has been updated to B), but its schema 1610 // change has not finished yet. We still had a valid lease with name A in 1611 // it. What to do, what to do? We could allow name B to be used, but who 1612 // knows what consequences that would have, since its not consistent with 1613 // the table. We could say "table B not found", but that means that, until 1614 // the next gossip update, this node would not service queries for this 1615 // table under the name B. That's no bueno, as B should be available to be 1616 // used immediately after the RENAME transaction has committed. 1617 // The problem is that we have a lease that we know is stale (the descriptor 1618 // in the DB doesn't necessarily have a new version yet, but it definitely 1619 // has a new name). So, lets force getting a fresh table. 1620 // This case (modulo the "committed" part) also applies when the txn doing a 1621 // RENAME had a lease on the old name, and then tries to use the new name 1622 // after the RENAME statement. 1623 // 1624 // How do we disambiguate between the a) and b)? We get a fresh lease on 1625 // the descriptor, as required by b), and then we'll know if we're trying to 1626 // resolve the current or the old name. 1627 // 1628 // TODO(vivek): check if the entire above comment is indeed true. Review the 1629 // use of NameMatchesTable() throughout this function. 
1630 if err := m.Release(table); err != nil { 1631 log.Warningf(ctx, "error releasing lease: %s", err) 1632 } 1633 if err := m.AcquireFreshestFromStore(ctx, tableID); err != nil { 1634 return nil, hlc.Timestamp{}, err 1635 } 1636 table, expiration, err = m.Acquire(ctx, timestamp, tableID) 1637 if err != nil { 1638 return nil, hlc.Timestamp{}, err 1639 } 1640 if !NameMatchesTable(&table.TableDescriptor, dbID, schemaID, tableName) { 1641 // If the name we had doesn't match the newest descriptor in the DB, then 1642 // we're trying to use an old name. 1643 if err := m.Release(table); err != nil { 1644 log.Warningf(ctx, "error releasing lease: %s", err) 1645 } 1646 return nil, hlc.Timestamp{}, sqlbase.ErrDescriptorNotFound 1647 } 1648 } 1649 return table, expiration, nil 1650 } 1651 1652 // resolveName resolves a table name to a descriptor ID at a particular 1653 // timestamp by looking in the database. If the mapping is not found, 1654 // sqlbase.ErrDescriptorNotFound is returned. 1655 func (m *Manager) resolveName( 1656 ctx context.Context, 1657 timestamp hlc.Timestamp, 1658 dbID sqlbase.ID, 1659 schemaID sqlbase.ID, 1660 tableName string, 1661 ) (sqlbase.ID, error) { 1662 id := sqlbase.InvalidID 1663 if err := m.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error { 1664 // Run the name lookup as high-priority, thereby pushing any intents out of 1665 // its way. We don't want schema changes to prevent name resolution/lease 1666 // acquisitions; we'd rather force them to refresh. Also this prevents 1667 // deadlocks in cases where the name resolution is triggered by the 1668 // transaction doing the schema change itself. 
1669 if err := txn.SetUserPriority(roachpb.MaxUserPriority); err != nil { 1670 return err 1671 } 1672 txn.SetFixedTimestamp(ctx, timestamp) 1673 var found bool 1674 var err error 1675 found, id, err = sqlbase.LookupObjectID(ctx, txn, m.codec, dbID, schemaID, tableName) 1676 if err != nil { 1677 return err 1678 } 1679 if !found { 1680 return nil 1681 } 1682 return nil 1683 }); err != nil { 1684 return id, err 1685 } 1686 if id == sqlbase.InvalidID { 1687 return id, sqlbase.ErrDescriptorNotFound 1688 } 1689 return id, nil 1690 } 1691 1692 // Acquire acquires a read lease for the specified table ID valid for 1693 // the timestamp. It returns the table descriptor and a expiration time. 1694 // A transaction using this descriptor must ensure that its 1695 // commit-timestamp < expiration-time. Care must be taken to not modify 1696 // the returned descriptor. 1697 // 1698 // Known limitation: Acquire() can return an error after the table with 1699 // the tableID has been dropped. This is true even when using a timestamp 1700 // less than the timestamp of the DROP command. This is because Acquire 1701 // can only return an older version of a descriptor if the latest version 1702 // can be leased; as it stands a dropped table cannot be leased. 1703 func (m *Manager) Acquire( 1704 ctx context.Context, timestamp hlc.Timestamp, tableID sqlbase.ID, 1705 ) (*sqlbase.ImmutableTableDescriptor, hlc.Timestamp, error) { 1706 for { 1707 t := m.findTableState(tableID, true /*create*/) 1708 table, latest, err := t.findForTimestamp(ctx, timestamp) 1709 if err == nil { 1710 // If the latest lease is nearly expired, ensure a renewal is queued. 
1711 if latest { 1712 durationUntilExpiry := time.Duration(table.expiration.WallTime - timestamp.WallTime) 1713 if durationUntilExpiry < m.Storage.leaseRenewalTimeout { 1714 if err := t.maybeQueueLeaseRenewal(ctx, m, tableID, table.Name); err != nil { 1715 return nil, hlc.Timestamp{}, err 1716 } 1717 } 1718 } 1719 return &table.ImmutableTableDescriptor, table.expiration, nil 1720 } 1721 switch { 1722 case errors.Is(err, errRenewLease): 1723 if err := func() error { 1724 t.markAcquisitionStart(ctx) 1725 defer t.markAcquisitionDone(ctx) 1726 // Renew lease and retry. This will block until the lease is acquired. 1727 _, errLease := acquireNodeLease(ctx, m, tableID) 1728 return errLease 1729 }(); err != nil { 1730 return nil, hlc.Timestamp{}, err 1731 } 1732 1733 if m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent != nil { 1734 m.testingKnobs.LeaseStoreTestingKnobs.LeaseAcquireResultBlockEvent(AcquireBlock) 1735 } 1736 1737 case errors.Is(err, errReadOlderTableVersion): 1738 // Read old table versions from the store. This can block while reading 1739 // old table versions from the store. 1740 versions, errRead := m.readOlderVersionForTimestamp(ctx, tableID, timestamp) 1741 if errRead != nil { 1742 return nil, hlc.Timestamp{}, errRead 1743 } 1744 m.insertTableVersions(tableID, versions) 1745 1746 default: 1747 return nil, hlc.Timestamp{}, err 1748 } 1749 } 1750 } 1751 1752 // Release releases a previously acquired table. 1753 func (m *Manager) Release(desc *sqlbase.ImmutableTableDescriptor) error { 1754 t := m.findTableState(desc.ID, false /* create */) 1755 if t == nil { 1756 return errors.Errorf("table %d not found", desc.ID) 1757 } 1758 // TODO(pmattis): Can/should we delete from Manager.tables if the 1759 // tableState becomes empty? 1760 // TODO(andrei): I think we never delete from Manager.tables... which 1761 // could be bad if a lot of tables keep being created. 
I looked into cleaning 1762 // up a bit, but it seems tricky to do with the current locking which is split 1763 // between Manager and tableState. 1764 l, err := t.release(desc, m.removeOnceDereferenced()) 1765 if err != nil { 1766 return err 1767 } 1768 if l != nil { 1769 releaseLease(l, m) 1770 } 1771 return nil 1772 } 1773 1774 // removeOnceDereferenced returns true if the Manager thinks 1775 // a tableVersionState can be removed after its refcount goes to 0. 1776 func (m *Manager) removeOnceDereferenced() bool { 1777 return m.Storage.testingKnobs.RemoveOnceDereferenced || 1778 // Release from the store if the Manager is draining. 1779 m.isDraining() 1780 } 1781 1782 func (m *Manager) isDraining() bool { 1783 return m.draining.Load().(bool) 1784 } 1785 1786 // SetDraining (when called with 'true') removes all inactive leases. Any leases 1787 // that are active will be removed once the lease's reference count drops to 0. 1788 // 1789 // The reporter callback, if non-nil, is called on a best effort basis 1790 // to report work that needed to be done and which may or may not have 1791 // been done by the time this call returns. See the explanation in 1792 // pkg/server/drain.go for details. 1793 func (m *Manager) SetDraining(drain bool, reporter func(int, string)) { 1794 m.draining.Store(drain) 1795 if !drain { 1796 return 1797 } 1798 1799 m.mu.Lock() 1800 defer m.mu.Unlock() 1801 for _, t := range m.mu.tables { 1802 t.mu.Lock() 1803 leases := t.removeInactiveVersions() 1804 t.mu.Unlock() 1805 for _, l := range leases { 1806 releaseLease(l, m) 1807 } 1808 if reporter != nil { 1809 // Report progress through the Drain RPC. 1810 reporter(len(leases), "table leases") 1811 } 1812 } 1813 } 1814 1815 // If create is set, cache and stopper need to be set as well. 
1816 func (m *Manager) findTableState(tableID sqlbase.ID, create bool) *tableState { 1817 m.mu.Lock() 1818 defer m.mu.Unlock() 1819 t := m.mu.tables[tableID] 1820 if t == nil && create { 1821 t = &tableState{id: tableID, stopper: m.stopper} 1822 m.mu.tables[tableID] = t 1823 } 1824 return t 1825 } 1826 1827 // RefreshLeases starts a goroutine that refreshes the lease manager 1828 // leases for tables received in the latest system configuration via gossip or 1829 // rangefeeds. This function must be passed a non-nil gossip if 1830 // VersionRangefeedLeases is not active. 1831 func (m *Manager) RefreshLeases( 1832 ctx context.Context, s *stop.Stopper, db *kv.DB, g gossip.DeprecatedGossip, 1833 ) { 1834 s.RunWorker(ctx, func(ctx context.Context) { 1835 m.refreshLeases(ctx, g, db, s) 1836 }) 1837 } 1838 1839 func (m *Manager) refreshLeases( 1840 ctx context.Context, g gossip.DeprecatedGossip, db *kv.DB, s *stop.Stopper, 1841 ) { 1842 tableUpdateCh := make(chan *sqlbase.TableDescriptor) 1843 m.watchForUpdates(ctx, s, db, g, tableUpdateCh) 1844 s.RunWorker(ctx, func(ctx context.Context) { 1845 for { 1846 select { 1847 case table := <-tableUpdateCh: 1848 // NB: We allow nil tables to be sent to synchronize the updating of 1849 // tables. 1850 if table == nil { 1851 continue 1852 } 1853 1854 if evFunc := m.testingKnobs.TestingTableUpdateEvent; evFunc != nil { 1855 if err := evFunc(table); err != nil { 1856 log.Infof(ctx, "skipping table update of %v due to knob: %v", 1857 table, err) 1858 } 1859 } 1860 1861 // Try to refresh the table lease to one >= this version. 
1862 log.VEventf(ctx, 2, "purging old version of table %d@%d (offline %v)", 1863 table.ID, table.Version, table.GoingOffline()) 1864 if err := purgeOldVersions( 1865 ctx, db, table.ID, table.GoingOffline(), table.Version, m); err != nil { 1866 log.Warningf(ctx, "error purging leases for table %d(%s): %s", 1867 table.ID, table.Name, err) 1868 } 1869 1870 if evFunc := m.testingKnobs.TestingTableRefreshedEvent; evFunc != nil { 1871 evFunc(table) 1872 } 1873 1874 case <-s.ShouldQuiesce(): 1875 return 1876 } 1877 } 1878 }) 1879 } 1880 1881 // watchForUpdates will watch either gossip or rangefeeds for updates. If the 1882 // version does not currently support rangefeeds, gossip will be used until 1883 // rangefeeds are supported, at which time, the system will shut down the 1884 // gossip listener and start using rangefeeds. 1885 func (m *Manager) watchForUpdates( 1886 ctx context.Context, 1887 s *stop.Stopper, 1888 db *kv.DB, 1889 g gossip.DeprecatedGossip, 1890 tableUpdateCh chan *sqlbase.TableDescriptor, 1891 ) { 1892 useRangefeeds := m.testingKnobs.AlwaysUseRangefeeds || 1893 m.settings.Version.IsActive(ctx, clusterversion.VersionRangefeedLeases) 1894 if useRangefeeds { 1895 m.watchForRangefeedUpdates(ctx, s, db, tableUpdateCh) 1896 return 1897 } 1898 gossipCtx, cancelWatchingGossip := context.WithCancel(ctx) 1899 m.watchForGossipUpdates(gossipCtx, s, g, tableUpdateCh) 1900 canUseRangefeedsCh := m.waitForRangefeedsToBeUsable(ctx, s) 1901 if err := s.RunAsyncTask(ctx, "wait for upgrade", func(ctx context.Context) { 1902 select { 1903 case <-s.ShouldQuiesce(): 1904 return 1905 case <-canUseRangefeedsCh: 1906 // Note: It's okay that the cancelation of gossip watching is 1907 // asynchronous. At worst we'd get duplicate updates or stale updates. 1908 // Both of those are handled. 1909 cancelWatchingGossip() 1910 // Note: It's safe to start watching for rangefeeds now. We know that all 1911 // nodes support rangefeeds in the system config span. 
Even though there 1912 // may not have been logical ops for all operations in the log, the 1913 // catch-up scan should take us up to the present. 1914 // 1915 // When the rangefeed starts up we'll pass it an initial timestamp which 1916 // is no newer than all updates to the system config span we've already 1917 // seen (see setResolvedTimestamp and its callers). The rangefeed API 1918 // ensures that we will see all updates from on or before that timestamp 1919 // at least once. 1920 m.watchForRangefeedUpdates(ctx, s, db, tableUpdateCh) 1921 } 1922 }); err != nil { 1923 // Note: this can only happen if the stopper has been stopped. 1924 return 1925 } 1926 } 1927 1928 func (m *Manager) watchForGossipUpdates( 1929 ctx context.Context, 1930 s *stop.Stopper, 1931 g gossip.DeprecatedGossip, 1932 tableUpdateCh chan<- *sqlbase.TableDescriptor, 1933 ) { 1934 if _, err := g.OptionalErr(47150); err != nil { 1935 log.Fatalf(ctx, "required gossip until %v is active: %v", clusterversion.VersionRangefeedLeases, err) 1936 } 1937 1938 s.RunWorker(ctx, func(ctx context.Context) { 1939 descKeyPrefix := m.codec.TablePrefix(uint32(sqlbase.DescriptorTable.ID)) 1940 // TODO(ajwerner): Add a mechanism to unregister this channel upon return. 
1941 gossipUpdateC := g.DeprecatedRegisterSystemConfigChannel(47150) 1942 filter := gossip.MakeSystemConfigDeltaFilter(descKeyPrefix) 1943 1944 ctx, cancel := s.WithCancelOnQuiesce(ctx) 1945 defer cancel() 1946 for { 1947 select { 1948 case <-gossipUpdateC: 1949 m.handleUpdatedSystemCfg(ctx, g, &filter, tableUpdateCh) 1950 case <-s.ShouldQuiesce(): 1951 return 1952 } 1953 } 1954 }) 1955 } 1956 1957 func (m *Manager) watchForRangefeedUpdates( 1958 ctx context.Context, s *stop.Stopper, db *kv.DB, tableUpdateCh chan<- *sqlbase.TableDescriptor, 1959 ) { 1960 if log.V(1) { 1961 log.Infof(ctx, "using rangefeeds for lease manager updates") 1962 } 1963 distSender := db.NonTransactionalSender().(*kv.CrossRangeTxnWrapperSender).Wrapped().(*kvcoord.DistSender) 1964 eventCh := make(chan *roachpb.RangeFeedEvent) 1965 ctx, _ = s.WithCancelOnQuiesce(ctx) 1966 if err := s.RunAsyncTask(ctx, "lease rangefeed", func(ctx context.Context) { 1967 for { 1968 ts := m.getResolvedTimestamp() 1969 descKeyPrefix := m.codec.TablePrefix(uint32(sqlbase.DescriptorTable.ID)) 1970 span := roachpb.Span{ 1971 Key: descKeyPrefix, 1972 EndKey: descKeyPrefix.PrefixEnd(), 1973 } 1974 // Note: We don't need to use withDiff to detect version changes because 1975 // the Manager already stores the relevant version information. 1976 const withDiff = false 1977 log.VEventf(ctx, 1, "starting rangefeed from %v on %v", ts, span) 1978 err := distSender.RangeFeed(ctx, span, ts, withDiff, eventCh) 1979 if err != nil && ctx.Err() == nil { 1980 log.Warningf(ctx, "lease rangefeed failed, restarting: %v", err) 1981 } 1982 if ctx.Err() != nil { 1983 log.VEventf(ctx, 1, "exiting rangefeed") 1984 return 1985 } 1986 } 1987 }); err != nil { 1988 // This will only fail if the stopper has been stopped. 
1989 return 1990 } 1991 handleEvent := func(ev *roachpb.RangeFeedValue) { 1992 if len(ev.Value.RawBytes) == 0 { 1993 return 1994 } 1995 var descriptor sqlbase.Descriptor 1996 if err := ev.Value.GetProto(&descriptor); err != nil { 1997 log.ReportOrPanic(ctx, &m.settings.SV, 1998 "%s: unable to unmarshal descriptor %v", ev.Key, ev.Value) 1999 return 2000 } 2001 table := descriptor.Table(ev.Value.Timestamp) 2002 if table == nil { 2003 return 2004 } 2005 2006 // Note that we don't need to "fill in" the descriptor here. Nobody 2007 // actually reads the table, but it's necessary for the call to 2008 // ValidateTable(). 2009 if err := table.MaybeFillInDescriptor(ctx, nil, m.codec); err != nil { 2010 log.ReportOrPanic(ctx, &m.settings.SV, 2011 "%s: unable to fill in table descriptor %v", ev.Key, table) 2012 return 2013 } 2014 if err := table.ValidateTable(); err != nil { 2015 // Note: we don't ReportOrPanic here because invalid descriptors are 2016 // sometimes created during testing. 2017 log.Errorf(ctx, "%s: received invalid table descriptor: %s. 
Desc: %v", ev.Key, err, table) 2018 return 2019 } 2020 if log.V(2) { 2021 log.Infof(ctx, "%s: refreshing lease table: %d (%s), version: %d, dropped: %t", 2022 ev.Key, table.ID, table.Name, table.Version, table.Dropped()) 2023 } 2024 select { 2025 case <-ctx.Done(): 2026 case tableUpdateCh <- table: 2027 } 2028 } 2029 s.RunWorker(ctx, func(ctx context.Context) { 2030 for { 2031 select { 2032 case <-ctx.Done(): 2033 return 2034 case e := <-eventCh: 2035 if e.Checkpoint != nil { 2036 log.VEventf(ctx, 2, "got rangefeed checkpoint %v", e.Checkpoint) 2037 m.setResolvedTimestamp(e.Checkpoint.ResolvedTS) 2038 continue 2039 } 2040 if e.Error != nil { 2041 log.Warningf(ctx, "got an error from a rangefeed: %v", e.Error.Error) 2042 continue 2043 } 2044 if e.Val != nil { 2045 handleEvent(e.Val) 2046 } 2047 } 2048 } 2049 }) 2050 } 2051 2052 func (m *Manager) handleUpdatedSystemCfg( 2053 ctx context.Context, 2054 g gossip.DeprecatedGossip, 2055 cfgFilter *gossip.SystemConfigDeltaFilter, 2056 tableUpdateChan chan<- *sqlbase.TableDescriptor, 2057 ) { 2058 cfg := g.DeprecatedSystemConfig(47150) 2059 // Read all tables and their versions 2060 if log.V(2) { 2061 log.Info(ctx, "received a new config; will refresh leases") 2062 } 2063 var latestTimestamp hlc.Timestamp 2064 cfgFilter.ForModified(cfg, func(kv roachpb.KeyValue) { 2065 // Attempt to unmarshal config into a table/database descriptor. 2066 var descriptor sqlbase.Descriptor 2067 if latestTimestamp.Less(kv.Value.Timestamp) { 2068 latestTimestamp = kv.Value.Timestamp 2069 } 2070 if err := kv.Value.GetProto(&descriptor); err != nil { 2071 log.Warningf(ctx, "%s: unable to unmarshal descriptor %v", kv.Key, kv.Value) 2072 return 2073 } 2074 switch union := descriptor.Union.(type) { 2075 case *sqlbase.Descriptor_Table: 2076 table := union.Table 2077 // Note that we don't need to "fill in" the descriptor here. Nobody 2078 // actually reads the table, but it's necessary for the call to 2079 // ValidateTable(). 
2080 if err := table.MaybeFillInDescriptor(ctx, nil, m.codec); err != nil { 2081 log.Warningf(ctx, "%s: unable to fill in table descriptor %v", kv.Key, table) 2082 return 2083 } 2084 if err := table.ValidateTable(); err != nil { 2085 log.Errorf(ctx, "%s: received invalid table descriptor: %s. Desc: %v", 2086 kv.Key, err, table, 2087 ) 2088 return 2089 } 2090 if log.V(2) { 2091 log.Infof(ctx, "%s: refreshing lease table: %d (%s), version: %d, dropped: %t", 2092 kv.Key, table.ID, table.Name, table.Version, table.Dropped()) 2093 } 2094 select { 2095 case <-ctx.Done(): 2096 case tableUpdateChan <- table: 2097 } 2098 2099 case *sqlbase.Descriptor_Database: 2100 // Ignore. 2101 } 2102 }) 2103 if !latestTimestamp.IsEmpty() { 2104 m.setResolvedTimestamp(latestTimestamp) 2105 } 2106 // Attempt to shove a nil table descriptor into the channel to ensure that 2107 // we've processed all of the events previously sent. 2108 select { 2109 case <-ctx.Done(): 2110 // If we've been canceled, the other size of the channel will also have 2111 // been canceled. 2112 case tableUpdateChan <- nil: 2113 } 2114 } 2115 2116 // waitForRangefeedsToBeUsable returns a channel which is closed when rangefeeds 2117 // are usable according to the cluster version. 2118 func (m *Manager) waitForRangefeedsToBeUsable(ctx context.Context, s *stop.Stopper) chan struct{} { 2119 // TODO(ajwerner): Add a callback to notify about version changes. 2120 // Checking is pretty cheap but really this should be a callback. 
2121 const defaultCheckInterval = 10 * time.Second 2122 checkInterval := defaultCheckInterval 2123 if m.testingKnobs.VersionPollIntervalForRangefeeds != 0 { 2124 checkInterval = m.testingKnobs.VersionPollIntervalForRangefeeds 2125 } 2126 upgradeChan := make(chan struct{}) 2127 timer := timeutil.NewTimer() 2128 timer.Reset(0) 2129 s.RunWorker(ctx, func(ctx context.Context) { 2130 for { 2131 select { 2132 case <-timer.C: 2133 timer.Read = true 2134 if m.settings.Version.IsActive(ctx, clusterversion.VersionRangefeedLeases) { 2135 close(upgradeChan) 2136 return 2137 } 2138 timer.Reset(checkInterval) 2139 case <-ctx.Done(): 2140 return 2141 case <-s.ShouldQuiesce(): 2142 return 2143 } 2144 } 2145 }) 2146 return upgradeChan 2147 } 2148 2149 // setResolvedTimestamp marks the Manager as having processed all updates 2150 // up to this timestamp. It is set under the gossip path based on the highest 2151 // timestamp seen in a system config and under the rangefeed path when a 2152 // resolved timestamp is received. 2153 func (m *Manager) setResolvedTimestamp(ts hlc.Timestamp) { 2154 m.mu.Lock() 2155 defer m.mu.Unlock() 2156 if m.mu.updatesResolvedTimestamp.Less(ts) { 2157 m.mu.updatesResolvedTimestamp = ts 2158 } 2159 } 2160 2161 func (m *Manager) getResolvedTimestamp() hlc.Timestamp { 2162 m.mu.Lock() 2163 defer m.mu.Unlock() 2164 return m.mu.updatesResolvedTimestamp 2165 } 2166 2167 // tableLeaseRefreshLimit is the upper-limit on the number of table leases 2168 // that will continuously have their lease refreshed. 2169 var tableLeaseRefreshLimit = settings.RegisterIntSetting( 2170 "sql.tablecache.lease.refresh_limit", 2171 "maximum number of tables to periodically refresh leases for", 2172 50, 2173 ) 2174 2175 // PeriodicallyRefreshSomeLeases so that leases are fresh and can serve 2176 // traffic immediately. 2177 // TODO(vivek): Remove once epoch based table leases are implemented. 
2178 func (m *Manager) PeriodicallyRefreshSomeLeases(ctx context.Context) { 2179 m.stopper.RunWorker(ctx, func(ctx context.Context) { 2180 if m.leaseDuration <= 0 { 2181 return 2182 } 2183 refreshTimer := timeutil.NewTimer() 2184 defer refreshTimer.Stop() 2185 refreshTimer.Reset(m.Storage.jitteredLeaseDuration() / 2) 2186 for { 2187 select { 2188 case <-m.stopper.ShouldQuiesce(): 2189 return 2190 2191 case <-refreshTimer.C: 2192 refreshTimer.Read = true 2193 refreshTimer.Reset(m.Storage.jitteredLeaseDuration() / 2) 2194 2195 m.refreshSomeLeases(ctx) 2196 } 2197 } 2198 }) 2199 } 2200 2201 // Refresh some of the current leases. 2202 func (m *Manager) refreshSomeLeases(ctx context.Context) { 2203 limit := tableLeaseRefreshLimit.Get(&m.settings.SV) 2204 if limit <= 0 { 2205 return 2206 } 2207 // Construct a list of tables needing their leases to be reacquired. 2208 m.mu.Lock() 2209 ids := make([]sqlbase.ID, 0, len(m.mu.tables)) 2210 var i int64 2211 for k, table := range m.mu.tables { 2212 if i++; i > limit { 2213 break 2214 } 2215 table.mu.Lock() 2216 dropped := table.mu.dropped 2217 table.mu.Unlock() 2218 if !dropped { 2219 ids = append(ids, k) 2220 } 2221 } 2222 m.mu.Unlock() 2223 // Limit the number of concurrent lease refreshes. 2224 var wg sync.WaitGroup 2225 for i := range ids { 2226 id := ids[i] 2227 wg.Add(1) 2228 if err := m.stopper.RunLimitedAsyncTask( 2229 ctx, fmt.Sprintf("refresh table:%d lease", id), m.sem, true /*wait*/, func(ctx context.Context) { 2230 defer wg.Done() 2231 if _, err := acquireNodeLease(ctx, m, id); err != nil { 2232 log.Infof(ctx, "refreshing table: %d lease failed: %s", id, err) 2233 } 2234 }); err != nil { 2235 log.Infof(ctx, "didnt refresh table: %d lease: %s", id, err) 2236 wg.Done() 2237 } 2238 } 2239 wg.Wait() 2240 } 2241 2242 // DeleteOrphanedLeases releases all orphaned leases created by a prior 2243 // instance of this node. 
timeThreshold is a walltime lower than the 2244 // lowest hlc timestamp that the current instance of the node can use. 2245 func (m *Manager) DeleteOrphanedLeases(timeThreshold int64) { 2246 if m.testingKnobs.DisableDeleteOrphanedLeases { 2247 return 2248 } 2249 // TODO(asubiotto): clear up the nodeID naming here and in the table below, 2250 // tracked as https://github.com/cockroachdb/cockroach/issues/48271. 2251 nodeID := m.Storage.nodeIDContainer.SQLInstanceID() 2252 if nodeID == 0 { 2253 panic("zero nodeID") 2254 } 2255 2256 // Run as async worker to prevent blocking the main server Start method. 2257 // Exit after releasing all the orphaned leases. 2258 m.stopper.RunWorker(context.Background(), func(ctx context.Context) { 2259 // This could have been implemented using DELETE WHERE, but DELETE WHERE 2260 // doesn't implement AS OF SYSTEM TIME. 2261 2262 // Read orphaned leases. 2263 sqlQuery := fmt.Sprintf(` 2264 SELECT "descID", version, expiration FROM system.public.lease AS OF SYSTEM TIME %d WHERE "nodeID" = %d 2265 `, timeThreshold, nodeID) 2266 var rows []tree.Datums 2267 retryOptions := base.DefaultRetryOptions() 2268 retryOptions.Closer = m.stopper.ShouldQuiesce() 2269 // The retry is required because of errors caused by node restarts. Retry 30 times. 2270 if err := retry.WithMaxAttempts(ctx, retryOptions, 30, func() error { 2271 var err error 2272 rows, err = m.Storage.internalExecutor.Query( 2273 ctx, "read orphaned table leases", nil /*txn*/, sqlQuery) 2274 return err 2275 }); err != nil { 2276 log.Warningf(ctx, "unable to read orphaned leases: %+v", err) 2277 return 2278 } 2279 2280 var wg sync.WaitGroup 2281 defer wg.Wait() 2282 for i := range rows { 2283 // Early exit? 
2284 row := rows[i] 2285 wg.Add(1) 2286 lease := storedTableLease{ 2287 id: sqlbase.ID(tree.MustBeDInt(row[0])), 2288 version: int(tree.MustBeDInt(row[1])), 2289 expiration: tree.MustBeDTimestamp(row[2]), 2290 } 2291 if err := m.stopper.RunLimitedAsyncTask( 2292 ctx, fmt.Sprintf("release table lease %+v", lease), m.sem, true /*wait*/, func(ctx context.Context) { 2293 m.Storage.release(ctx, m.stopper, &lease) 2294 log.Infof(ctx, "released orphaned table lease: %+v", lease) 2295 wg.Done() 2296 }); err != nil { 2297 log.Warningf(ctx, "did not release orphaned table lease: %+v, err = %s", lease, err) 2298 wg.Done() 2299 } 2300 } 2301 }) 2302 } 2303 2304 // DB returns the Manager's handle to a kv.DB. 2305 func (m *Manager) DB() *kv.DB { 2306 return m.db 2307 } 2308 2309 // Codec return the Manager's SQLCodec. 2310 func (m *Manager) Codec() keys.SQLCodec { 2311 return m.codec 2312 } 2313 2314 // VisitLeases introspects the state of leases managed by the Manager. 2315 // 2316 // TODO(ajwerner): consider refactoring the function to take a struct, maybe 2317 // called LeaseInfo. 2318 func (m *Manager) VisitLeases( 2319 f func(desc sqlbase.TableDescriptor, dropped bool, refCount int, expiration tree.DTimestamp) (wantMore bool), 2320 ) { 2321 m.mu.Lock() 2322 defer m.mu.Unlock() 2323 for _, ts := range m.mu.tables { 2324 tableVisitor := func() (wantMore bool) { 2325 ts.mu.Lock() 2326 defer ts.mu.Unlock() 2327 2328 dropped := ts.mu.dropped 2329 2330 for _, state := range ts.mu.active.data { 2331 state.mu.Lock() 2332 lease := state.mu.lease 2333 refCount := state.mu.refcount 2334 state.mu.Unlock() 2335 2336 if lease == nil { 2337 continue 2338 } 2339 2340 if !f(state.TableDescriptor, dropped, refCount, lease.expiration) { 2341 return false 2342 } 2343 } 2344 return true 2345 } 2346 if !tableVisitor() { 2347 return 2348 } 2349 } 2350 } 2351 2352 // TestingAcquireAndAssertMinVersion acquires a read lease for the specified 2353 // table ID. 
The lease is grabbed on the latest version if >= specified version. 2354 // It returns a table descriptor and an expiration time valid for the timestamp. 2355 // This method is useful for testing and is only intended to be used in that 2356 // context. 2357 func (m *Manager) TestingAcquireAndAssertMinVersion( 2358 ctx context.Context, 2359 timestamp hlc.Timestamp, 2360 tableID sqlbase.ID, 2361 minVersion sqlbase.DescriptorVersion, 2362 ) (*sqlbase.ImmutableTableDescriptor, hlc.Timestamp, error) { 2363 t := m.findTableState(tableID, true) 2364 if err := ensureVersion(ctx, tableID, minVersion, m); err != nil { 2365 return nil, hlc.Timestamp{}, err 2366 } 2367 table, _, err := t.findForTimestamp(ctx, timestamp) 2368 if err != nil { 2369 return nil, hlc.Timestamp{}, err 2370 } 2371 return &table.ImmutableTableDescriptor, table.expiration, nil 2372 }