github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_range_lease.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// This file contains replica methods related to range leases.
//
// Here be dragons: The lease system (especially for epoch-based
// leases) relies on multiple interlocking conditional puts (here and
// in NodeLiveness). Reads (to get expected values) and conditional
// puts have to happen in a certain order, leading to surprising
// dependencies at a distance (for example, there's a LeaseStatus
// object that gets plumbed most of the way through this file.
// LeaseStatus bundles the results of multiple checks with the time at
// which they were performed, so that timestamp must be used for later
// operations). The current arrangement is not perfect, and some
// opportunities for improvement appear, but any changes must be made
// very carefully.
//
// NOTE(bdarnell): The biggest problem with the current code is that
// with epoch-based leases, we may do two separate slow operations
// (IncrementEpoch/Heartbeat and RequestLease/AdminTransferLease). In
// the organization that was inherited from expiration-based leases,
// we prepare the arguments we're going to use for the lease
// operations before performing the liveness operations, and by the
// time the liveness operations complete those may be stale.
//
// Therefore, my suggested refactoring would be to move the liveness
// operations earlier in the process, soon after the initial
// leaseStatus call. If a liveness operation is required, do it and
// start over, with a fresh leaseStatus.
//
// This could also allow the liveness operations to be coalesced per
// node instead of having each range separately queue up redundant
// liveness operations. (The InitOrJoin model predates the
// singleflight package; could we simplify things by using it?)

package kvserver

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
	"github.com/opentracing/opentracing-go"
)

var leaseStatusLogLimiter = log.Every(5 * time.Second)

// leaseRequestHandle is a handle to an asynchronous lease request.
type leaseRequestHandle struct {
	p *pendingLeaseRequest
	c chan *roachpb.Error
}

// C returns the channel on which the lease request's result will be sent.
func (h *leaseRequestHandle) C() <-chan *roachpb.Error {
	if h.c == nil {
		panic("handle already canceled")
	}
	return h.c
}
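
// The following is an illustrative sketch, not part of the original file: a
// caller that obtains a handle typically selects on C() and cancels the
// handle when its own context is done, which is the pattern used by
// AdminTransferLease and redirectOnOrAcquireLease later in this file:
//
//	select {
//	case pErr := <-llHandle.C():
//		// The lease request resolved; pErr is nil on success.
//	case <-ctx.Done():
//		llHandle.Cancel()
//	}
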
// Cancel cancels the request handle. It also cancels the asynchronous
// lease request task if its reference count drops to zero.
func (h *leaseRequestHandle) Cancel() {
	h.p.repl.mu.Lock()
	defer h.p.repl.mu.Unlock()
	if len(h.c) == 0 {
		// Our lease request is ongoing...
		// Unregister handle.
		delete(h.p.llHandles, h)
		// Cancel request, if necessary.
		if len(h.p.llHandles) == 0 {
			h.p.cancelLocked()
		}
	}
	// Mark handle as canceled.
	h.c = nil
}

// resolve notifies the handle of the request's result.
//
// Requires repl.mu is exclusively locked.
func (h *leaseRequestHandle) resolve(pErr *roachpb.Error) { h.c <- pErr }

// pendingLeaseRequest coalesces RequestLease requests and lets
// callers join an in-progress lease request and wait for the result.
// The actual execution of the RequestLease Raft request is delegated
// to a replica.
//
// There are two types of leases: expiration-based and epoch-based.
// Expiration-based leases are considered valid as long as the wall
// time is less than the lease expiration timestamp minus the maximum
// clock offset. Epoch-based leases do not expire, but rely on the
// leaseholder maintaining its node liveness record (also a lease, but
// at the node level). All ranges up to and including the node
// liveness table must use expiration-based leases to avoid any
// circular dependencies.
//
// Methods are not thread-safe; a pendingLeaseRequest is logically part
// of the replica it references, so replica.mu should be used to
// synchronize all calls.
type pendingLeaseRequest struct {
	// The replica that the pendingLeaseRequest is a part of.
	repl *Replica
	// Set of request handles attached to the lease acquisition.
	// All accesses require repl.mu to be exclusively locked.
	llHandles map[*leaseRequestHandle]struct{}
	// cancelLocked is a context cancellation function for the async lease
	// request, if one exists. It cancels an ongoing lease request and cleans up
	// the request's state, including setting the cancelLocked function itself
	// to nil. It will be called when a lease request is canceled because all
	// handles cancel or when a lease request completes. If nil, then no request
	// is in progress. repl.mu must be exclusively locked to call the function.
	cancelLocked func()
	// nextLease is the pending RequestLease request, if any. It can be used to
	// figure out if we're in the process of extending our own lease, or
	// transferring it to another replica.
	nextLease roachpb.Lease
}

func makePendingLeaseRequest(repl *Replica) pendingLeaseRequest {
	return pendingLeaseRequest{
		repl:      repl,
		llHandles: make(map[*leaseRequestHandle]struct{}),
	}
}

// RequestPending returns the pending Lease, if one is in progress.
// The second return val is true if a lease request is pending.
//
// Requires repl.mu is read locked.
func (p *pendingLeaseRequest) RequestPending() (roachpb.Lease, bool) {
	pending := p.cancelLocked != nil
	if pending {
		return p.nextLease, true
	}
	return roachpb.Lease{}, false
}
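
// Illustrative sketch (not from the original file) of how requests coalesce:
// the first caller's InitOrJoinRequest starts the async lease request, and a
// later caller asking for the same next leaseholder simply joins it, so both
// handles resolve with the outcome of a single RequestLease proposal:
//
//	h1 := p.InitOrJoinRequest(ctx, repDesc, status, startKey, false /* transfer */)
//	h2 := p.InitOrJoinRequest(ctx, repDesc, status, startKey, false /* transfer */) // joins h1's request
//	pErr1, pErr2 := <-h1.C(), <-h2.C() // both observe the same result
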
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// handle on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// The new lease will be a successor to the one in the status
// argument, and its fields will be used to fill in the expected
// values for liveness and lease operations.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Requires repl.mu is exclusively locked.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	ctx context.Context,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	status kvserverpb.LeaseStatus,
	startKey roachpb.Key,
	transfer bool,
) *leaseRequestHandle {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}

		// We can't join the request in progress.
		// TODO(nvanbenschoten): should this return a LeaseRejectedError? Should
		// it cancel and replace the request in progress? Reconsider.
		return p.newResolvedHandle(roachpb.NewErrorf(
			"request for different replica in progress (requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID))
	}

	// No request in progress. Let's propose a Lease command asynchronously.
	llHandle := p.newHandle()
	reqHeader := roachpb.RequestHeader{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	now := p.repl.store.Clock().Now()
	reqLease := roachpb.Lease{
		// It's up to us to ensure that Lease.Start is greater than the
		// end time of the previous lease. This means that if status
		// refers to an expired epoch lease, we must increment the epoch
		// *at status.Timestamp* before we can propose this lease.
		//
		// Note that the server may decrease our proposed start time if it
		// decides that it is safe to do so (for example, this happens
		// when renewing an expiration-based lease), but it will never
		// increase it (and a start timestamp that is too low is unsafe
		// because it results in incorrect initialization of the timestamp
		// cache on the new leaseholder).
		Start:      status.Timestamp,
		Replica:    nextLeaseHolder,
		ProposedTS: &now,
	}

	if p.repl.requiresExpiringLeaseRLocked() {
		reqLease.Expiration = &hlc.Timestamp{}
		*reqLease.Expiration = status.Timestamp.Add(int64(p.repl.store.cfg.RangeLeaseActiveDuration()), 0)
	} else {
		// Get the liveness for the next lease holder and set the epoch in the lease request.
		liveness, err := p.repl.store.cfg.NodeLiveness.GetLiveness(nextLeaseHolder.NodeID)
		if err != nil {
			llHandle.resolve(roachpb.NewError(&roachpb.LeaseRejectedError{
				Existing:  status.Lease,
				Requested: reqLease,
				Message:   fmt.Sprintf("couldn't request lease for %+v: %v", nextLeaseHolder, err),
			}))
			return llHandle
		}
		reqLease.Epoch = liveness.Epoch
	}

	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			RequestHeader: reqHeader,
			Lease:         reqLease,
			PrevLease:     status.Lease,
		}
	} else {
		minProposedTS := p.repl.mu.minLeaseProposedTS
		leaseReq = &roachpb.RequestLeaseRequest{
			RequestHeader: reqHeader,
			Lease:         reqLease,
			// PrevLease must match for our lease to be accepted. If another
			// lease is applied between our previous call to leaseStatus and
			// our lease request applying, it will be rejected.
			PrevLease:     status.Lease,
			MinProposedTS: &minProposedTS,
		}
	}

	if err := p.requestLeaseAsync(ctx, nextLeaseHolder, reqLease, status, leaseReq); err != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llHandle.resolve(roachpb.NewError(
			newNotLeaseHolderError(nil, p.repl.store.StoreID(), p.repl.mu.state.Desc)))
		return llHandle
	}
	// InitOrJoinRequest requires that repl.mu is exclusively locked. requestLeaseAsync
	// also requires this lock to send results on all waiter channels. This means that
	// no results will be sent until we've released the lock, so there's no race between
	// adding our new channel to p.llHandles below and requestLeaseAsync sending results
	// on all channels in p.llHandles. The same logic applies to p.nextLease.
	p.llHandles[llHandle] = struct{}{}
	p.nextLease = reqLease
	return llHandle
}

// requestLeaseAsync sends a transfer lease or lease request to the
// specified replica. The request is sent in an async task.
//
// The status argument is used as the expected value for liveness operations.
// reqLease and leaseReq must be consistent with the LeaseStatus.
func (p *pendingLeaseRequest) requestLeaseAsync(
	parentCtx context.Context,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	reqLease roachpb.Lease,
	status kvserverpb.LeaseStatus,
	leaseReq roachpb.Request,
) error {
	const opName = "request range lease"
	var sp opentracing.Span
	tr := p.repl.AmbientContext.Tracer
	if parentSp := opentracing.SpanFromContext(parentCtx); parentSp != nil {
		// We use FollowsFrom because the lease request's span can outlive the
		// parent request. This is possible if parentCtx is canceled after others
		// have coalesced on to this lease request (see leaseRequestHandle.Cancel).
		// TODO(andrei): we should use Tracer.StartChildSpan() for efficiency,
		// except that one does not currently support FollowsFrom relationships.
		sp = tr.StartSpan(
			opName,
			opentracing.FollowsFrom(parentSp.Context()),
			tracing.LogTagsFromCtx(parentCtx),
		)
	} else {
		sp = tr.(*tracing.Tracer).StartRootSpan(
			opName, logtags.FromContext(parentCtx), tracing.NonRecordableSpan)
	}

	// Create a new context *without* a timeout. Instead, we multiplex the
	// cancellation of all contexts onto this new one, only canceling it if all
	// coalesced requests timeout/cancel.
	// p.cancelLocked (defined below) is the cancel function that must be
	// called; calling just cancel is insufficient.
	ctx := p.repl.AnnotateCtx(context.Background())
	ctx = opentracing.ContextWithSpan(ctx, sp)
	ctx, cancel := context.WithCancel(ctx)

	// Make sure we clean up the context and request state. This will be called
	// either when the request completes cleanly or when it is terminated early.
	p.cancelLocked = func() {
		cancel()
		p.cancelLocked = nil
		p.nextLease = roachpb.Lease{}
	}

	err := p.repl.store.Stopper().RunAsyncTask(
		ctx, "storage.pendingLeaseRequest: requesting lease", func(ctx context.Context) {
			defer sp.Finish()

			// If requesting an epoch-based lease & current state is expired,
			// potentially heartbeat our own liveness or increment epoch of
			// prior owner. Note we only do this if the previous lease was
			// epoch-based.
			var pErr *roachpb.Error
			if reqLease.Type() == roachpb.LeaseEpoch && status.State == kvserverpb.LeaseState_EXPIRED &&
				status.Lease.Type() == roachpb.LeaseEpoch {
				var err error
				// If this replica is previous & next lease holder, manually heartbeat to become live.
				if status.Lease.OwnedBy(nextLeaseHolder.StoreID) &&
					p.repl.store.StoreID() == nextLeaseHolder.StoreID {
					if err = p.repl.store.cfg.NodeLiveness.Heartbeat(ctx, status.Liveness); err != nil {
						log.Errorf(ctx, "%v", err)
					}
				} else if status.Liveness.Epoch == status.Lease.Epoch {
					// If not owner, increment epoch if necessary to invalidate lease.
					// However, we only do so in the event that the next leaseholder is
					// considered live at this time. If not, there's no sense in
					// incrementing the expired leaseholder's epoch.
					if live, liveErr := p.repl.store.cfg.NodeLiveness.IsLive(nextLeaseHolder.NodeID); !live || liveErr != nil {
						err = errors.Errorf("not incrementing epoch on n%d because next leaseholder (n%d) not live (err = %v)",
							status.Liveness.NodeID, nextLeaseHolder.NodeID, liveErr)
						if log.V(1) {
							log.Infof(ctx, "%v", err)
						}
					} else if err = p.repl.store.cfg.NodeLiveness.IncrementEpoch(ctx, status.Liveness); err != nil {
						// If we get ErrEpochAlreadyIncremented, someone else beat
						// us to it. This proves that the target node is truly
						// dead *now*, but it doesn't prove that it was dead at
						// status.Timestamp (which we've encoded into our lease
						// request). It's possible that the node was temporarily
						// considered dead but revived without having its epoch
						// incremented, i.e. that it was in fact live at
						// status.Timestamp.
						//
						// It would be incorrect to simply proceed to sending our
						// lease request since our lease.Start may precede the
						// effective end timestamp of the predecessor lease (the
						// expiration of the last successful heartbeat before the
						// epoch increment), and so under this lease this node's
						// timestamp cache would not necessarily reflect all reads
						// served by the prior leaseholder.
						//
						// It would be correct to bump the timestamp in the lease
						// request and proceed, but that just sets up another race
						// between this node and the one that already incremented
						// the epoch. They're probably going to beat us this time
						// too, so just return the NotLeaseHolderError here
						// instead of trying to fix up the timestamps and submit
						// the lease request.
						//
						// ErrEpochAlreadyIncremented is not an unusual situation,
						// so we don't log it as an error.
						//
						// https://github.com/cockroachdb/cockroach/issues/35986
						if !errors.Is(err, ErrEpochAlreadyIncremented) {
							log.Errorf(ctx, "%v", err)
						}
					}
				}
				// Set error for propagation to all waiters below.
				if err != nil {
					// TODO(bdarnell): is status.Lease really what we want to put in the NotLeaseHolderError here?
					pErr = roachpb.NewError(newNotLeaseHolderError(&status.Lease, p.repl.store.StoreID(), p.repl.Desc()))
				}
			}

			// Send the RequestLeaseRequest or TransferLeaseRequest and wait for the new
			// lease to be applied.
			if pErr == nil {
				ba := roachpb.BatchRequest{}
				ba.Timestamp = p.repl.store.Clock().Now()
				ba.RangeID = p.repl.RangeID
				ba.Add(leaseReq)
				_, pErr = p.repl.Send(ctx, ba)
			}
			// We reset our state below regardless of whether we've gotten an error or
			// not, but note that an error is ambiguous - there's no guarantee that the
			// transfer will not still apply. That's OK, however, as the "in transfer"
			// state maintained by the pendingLeaseRequest is not relied on for
			// correctness (see repl.mu.minLeaseProposedTS), and resetting the state
			// is beneficial as it'll allow the replica to attempt to transfer again or
			// extend the existing lease in the future.

			p.repl.mu.Lock()
			defer p.repl.mu.Unlock()
			if ctx.Err() != nil {
				// We were canceled and this request was already cleaned up
				// under lock. At this point, another async request could be
				// active so we don't want to do anything else.
				return
			}

			// Send result of lease to all waiter channels and cleanup request.
			for llHandle := range p.llHandles {
				// Don't send the same transaction object twice; this can lead to races.
				if pErr != nil {
					pErrClone := *pErr
					pErrClone.SetTxn(pErr.GetTxn())
					llHandle.resolve(&pErrClone)
				} else {
					llHandle.resolve(nil)
				}
				delete(p.llHandles, llHandle)
			}
			p.cancelLocked()
		})
	if err != nil {
		p.cancelLocked()
		sp.Finish()
		return err
	}
	return nil
}

// JoinRequest adds one more waiter to the currently pending request.
// It is the caller's responsibility to ensure that there is a pending request,
// and that the request is compatible with whatever the caller is currently
// wanting to do (i.e. the request is naming the intended node as the next
// lease holder).
//
// Requires repl.mu is exclusively locked.
func (p *pendingLeaseRequest) JoinRequest() *leaseRequestHandle {
	llHandle := p.newHandle()
	if _, ok := p.RequestPending(); !ok {
		llHandle.resolve(roachpb.NewErrorf("no request in progress"))
		return llHandle
	}
	p.llHandles[llHandle] = struct{}{}
	return llHandle
}

// TransferInProgress returns the next lease, if the replica is in the process
// of transferring away its range lease. This next lease indicates the next
// lease holder. The second return val is true if a transfer is in progress.
// Note that the return values are best-effort and shouldn't be relied upon for
// correctness: if a previous transfer has returned an error, TransferInProgress
// will return `false`, but that doesn't necessarily mean that the transfer
// cannot still apply (see replica.mu.minLeaseProposedTS).
//
// It is assumed that the replica owning this pendingLeaseRequest owns the
// LeaderLease.
//
// replicaID is the ID of the parent replica.
//
// Requires repl.mu is read locked.
func (p *pendingLeaseRequest) TransferInProgress(
	replicaID roachpb.ReplicaID,
) (roachpb.Lease, bool) {
	if nextLease, ok := p.RequestPending(); ok {
		// Is the lease being transferred? (as opposed to just extended)
		if replicaID != nextLease.Replica.ReplicaID {
			return nextLease, true
		}
	}
	return roachpb.Lease{}, false
}

// newHandle creates a new leaseRequestHandle referencing the pending lease
// request.
func (p *pendingLeaseRequest) newHandle() *leaseRequestHandle {
	return &leaseRequestHandle{
		p: p,
		c: make(chan *roachpb.Error, 1),
	}
}

// newResolvedHandle creates a new leaseRequestHandle referencing the pending
// lease request. It then resolves the handle with the provided error.
func (p *pendingLeaseRequest) newResolvedHandle(pErr *roachpb.Error) *leaseRequestHandle {
	h := p.newHandle()
	h.resolve(pErr)
	return h
}
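
// Illustrative sketch (not from the original file) of the stasis window that
// leaseStatus below computes: with a maximum clock offset of 500ms and a lease
// (or liveness) expiration of t=100s, timestamps below 99.5s are VALID, those
// in [99.5s, 100s) are in STASIS, and those at or above 100s are EXPIRED.
// Roughly, in terms of the code below:
//
//	stasis := expiration.Add(-int64(maxOffset), 0)
//	// timestamp < stasis               => LeaseState_VALID (or PROSCRIBED)
//	// stasis <= timestamp < expiration => LeaseState_STASIS
//	// expiration <= timestamp          => LeaseState_EXPIRED
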
// leaseStatus returns lease status. If the lease is epoch-based,
// the liveness field will be set to the liveness used to compute
// its state, unless the state is LeaseState_ERROR.
//
// - The lease is considered valid if the timestamp is covered by the
//   supplied lease. This is determined differently depending on the
//   lease properties. For expiration-based leases, the timestamp is
//   covered if it's less than the expiration (minus the maximum
//   clock offset). For epoch-based "node liveness" leases, the lease
//   epoch must match the owner node's liveness epoch -AND- the
//   timestamp must be within the node's liveness expiration (also
//   minus the maximum clock offset).
//
//   To be valid, a lease which contains a valid ProposedTS must have
//   a proposed timestamp greater than the minimum proposed timestamp,
//   which prevents a restarted process from serving commands, since
//   the spanlatch manager has been wiped through the restart.
//
// - The lease is considered in stasis if the timestamp is within the
//   maximum clock offset window of the lease expiration.
//
// - The lease is considered expired in all other cases.
//
// The maximum clock offset must always be taken into consideration to
// avoid a failure of linearizability on a single register during
// lease changes. Without that stasis period, the following could
// occur:
//
// * a range lease gets committed on the new lease holder (but not the old).
// * client proposes and commits a write on new lease holder (with a
//   timestamp just greater than the expiration of the old lease).
// * client tries to read what it wrote, but hits a slow coordinator
//   (which assigns a timestamp covered by the old lease).
// * the read is served by the old lease holder (which has not
//   processed the change in lease holdership).
// * the client fails to read their own write.
func (r *Replica) leaseStatus(
	lease roachpb.Lease, timestamp, minProposedTS hlc.Timestamp,
) kvserverpb.LeaseStatus {
	status := kvserverpb.LeaseStatus{Timestamp: timestamp, Lease: lease}
	var expiration hlc.Timestamp
	if lease.Type() == roachpb.LeaseExpiration {
		expiration = lease.GetExpiration()
	} else {
		var err error
		status.Liveness, err = r.store.cfg.NodeLiveness.GetLiveness(lease.Replica.NodeID)
		if err != nil || status.Liveness.Epoch < lease.Epoch {
			// If lease validity can't be determined (e.g. gossip is down
			// and liveness info isn't available for owner), we can neither
			// use the lease nor do we want to attempt to acquire it.
			if err != nil {
				if leaseStatusLogLimiter.ShouldLog() {
					log.Warningf(context.TODO(), "can't determine lease status due to node liveness error: %+v", err)
				}
			}
			status.State = kvserverpb.LeaseState_ERROR
			return status
		}
		if status.Liveness.Epoch > lease.Epoch {
			status.State = kvserverpb.LeaseState_EXPIRED
			return status
		}
		expiration = hlc.Timestamp(status.Liveness.Expiration)
	}
	maxOffset := r.store.Clock().MaxOffset()
	stasis := expiration.Add(-int64(maxOffset), 0)
	if timestamp.Less(stasis) {
		status.State = kvserverpb.LeaseState_VALID
		// If the replica owns the lease, additionally verify that the lease's
		// proposed timestamp is not earlier than the min proposed timestamp.
		if lease.Replica.StoreID == r.store.StoreID() &&
			lease.ProposedTS != nil && lease.ProposedTS.Less(minProposedTS) {
			status.State = kvserverpb.LeaseState_PROSCRIBED
		}
	} else if timestamp.Less(expiration) {
		status.State = kvserverpb.LeaseState_STASIS
	} else {
		status.State = kvserverpb.LeaseState_EXPIRED
	}
	return status
}

// requiresExpiringLeaseRLocked returns whether this range uses an
// expiration-based lease; false if epoch-based. Ranges located before or
// including the node liveness table must use expiration leases to avoid
// circular dependencies on the node liveness table.
func (r *Replica) requiresExpiringLeaseRLocked() bool {
	return r.store.cfg.NodeLiveness == nil || !r.store.cfg.EnableEpochRangeLeases ||
		r.mu.state.Desc.StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax))
}
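
// As an illustrative sketch (not from the original file), callers in this
// file consult leaseStatus under r.mu and branch on the resulting state, much
// as redirectOnOrAcquireLease does below:
//
//	status := r.leaseStatus(*r.mu.state.Lease, r.store.Clock().Now(), r.mu.minLeaseProposedTS)
//	switch status.State {
//	case kvserverpb.LeaseState_VALID:
//		// Serve, unless the lease is owned elsewhere or a transfer is pending.
//	case kvserverpb.LeaseState_STASIS:
//		// Block on a renewal before serving.
//	case kvserverpb.LeaseState_EXPIRED:
//		// Request the lease.
//	case kvserverpb.LeaseState_PROSCRIBED:
//		// Re-request if owned by this store, otherwise redirect.
//	case kvserverpb.LeaseState_ERROR:
//		// Lease validity unknown; redirect with an empty NotLeaseHolderError.
//	}
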
// requestLeaseLocked executes a request to obtain or extend a lease
// asynchronously and returns a channel on which the result will be posted. If
// there's already a request in progress, we join in waiting for the results of
// that request. Unless an error is returned, the obtained lease will be valid
// for a time interval containing the requested timestamp.
// If a transfer is in progress, a NotLeaseHolderError directing to the recipient is
// sent on the returned chan.
func (r *Replica) requestLeaseLocked(
	ctx context.Context, status kvserverpb.LeaseStatus,
) *leaseRequestHandle {
	if r.store.TestingKnobs().LeaseRequestEvent != nil {
		r.store.TestingKnobs().LeaseRequestEvent(status.Timestamp)
	}
	// Propose a Raft command to get a lease for this replica.
	repDesc, err := r.getReplicaDescriptorRLocked()
	if err != nil {
		return r.mu.pendingLeaseRequest.newResolvedHandle(roachpb.NewError(err))
	}
	if transferLease, ok := r.mu.pendingLeaseRequest.TransferInProgress(repDesc.ReplicaID); ok {
		return r.mu.pendingLeaseRequest.newResolvedHandle(roachpb.NewError(
			newNotLeaseHolderError(&transferLease, r.store.StoreID(), r.mu.state.Desc)))
	}
	if r.store.IsDraining() {
		// We've retired from active duty.
		return r.mu.pendingLeaseRequest.newResolvedHandle(roachpb.NewError(
			newNotLeaseHolderError(nil, r.store.StoreID(), r.mu.state.Desc)))
	}
	return r.mu.pendingLeaseRequest.InitOrJoinRequest(
		ctx, repDesc, status, r.mu.state.Desc.StartKey.AsRawKey(), false /* transfer */)
}

// AdminTransferLease transfers the LeaderLease to another replica. A
// valid LeaseStatus must be supplied. Only the current holder of the
// LeaderLease can do a transfer, because it needs to stop serving
// reads and proposing Raft commands (CPut is a read) after sending
// the transfer command. If it did not stop serving reads immediately,
// it would potentially serve reads with timestamps greater than the
// start timestamp of the new (transferred) lease. More subtly, the
// replica can't even serve reads or propose commands with timestamps
// lower than the start of the new lease because it could lead to read
// your own write violations (see comments on the stasis period in
// IsLeaseValid). We could, in principle, serve reads more than the
// maximum clock offset in the past.
//
// The method waits for any in-progress lease extension to be done, and it also
// blocks until the transfer is done. If a transfer is already in progress,
// this method joins in waiting for it to complete if it's transferring to the
// same replica. Otherwise, a NotLeaseHolderError is returned.
func (r *Replica) AdminTransferLease(ctx context.Context, target roachpb.StoreID) error {
	// initTransferHelper inits a transfer if no extension is in progress.
	// It returns a channel for waiting for the result of a pending
	// extension (if any is in progress) and a channel for waiting for the
	// transfer (if it was successfully initiated).
	var nextLeaseHolder roachpb.ReplicaDescriptor
	initTransferHelper := func() (extension, transfer *leaseRequestHandle, err error) {
		r.mu.Lock()
		defer r.mu.Unlock()

		status := r.leaseStatus(*r.mu.state.Lease, r.store.Clock().Now(), r.mu.minLeaseProposedTS)
		if status.Lease.OwnedBy(target) {
			// The target is already the lease holder. Nothing to do.
			return nil, nil, nil
		}
		desc := r.mu.state.Desc
		if !status.Lease.OwnedBy(r.store.StoreID()) {
			return nil, nil, newNotLeaseHolderError(&status.Lease, r.store.StoreID(), desc)
		}
		// Verify the target is a replica of the range.
		var ok bool
		if nextLeaseHolder, ok = desc.GetReplicaDescriptor(target); !ok {
			return nil, nil, errors.Errorf("unable to find store %d in range %+v", target, desc)
		}

		// For now, don't allow replicas of type LEARNER to be leaseholders, see
		// comments in RequestLease and TransferLease for why.
		//
		// TODO(dan): We shouldn't need this, the checks in RequestLease and
		// TransferLease are the canonical ones and should be sufficient. Sadly, the
		// `r.mu.minLeaseProposedTS = status.Timestamp` line below will likely play
		// badly with that.
		// This would be an issue even without learners, but
		// omitting this check would make it worse. Fixme.
		if t := nextLeaseHolder.GetType(); t != roachpb.VOTER_FULL {
			return nil, nil, errors.Errorf(`cannot transfer lease to replica of type %s`, t)
		}

		if nextLease, ok := r.mu.pendingLeaseRequest.RequestPending(); ok &&
			nextLease.Replica != nextLeaseHolder {
			repDesc, err := r.getReplicaDescriptorRLocked()
			if err != nil {
				return nil, nil, err
			}
			if nextLease.Replica == repDesc {
				// There's an extension in progress. Let's wait for it to succeed and
				// try again.
				return r.mu.pendingLeaseRequest.JoinRequest(), nil, nil
			}
			// Another transfer is in progress, and it's not transferring to the
			// same replica we'd like.
			return nil, nil, newNotLeaseHolderError(&nextLease, r.store.StoreID(), desc)
		}
		// Stop using the current lease.
		r.mu.minLeaseProposedTS = status.Timestamp
		transfer = r.mu.pendingLeaseRequest.InitOrJoinRequest(
			ctx, nextLeaseHolder, status, desc.StartKey.AsRawKey(), true, /* transfer */
		)
		return nil, transfer, nil
	}

	// Loop while there's an extension in progress.
	for {
		// See if there's an extension in progress that we have to wait for.
		// If there isn't, request a transfer.
		extension, transfer, err := initTransferHelper()
		if err != nil {
			return err
		}
		if extension == nil {
			if transfer == nil {
				// The target is us and we're the lease holder.
				return nil
			}
			select {
			case pErr := <-transfer.C():
				return pErr.GoError()
			case <-ctx.Done():
				transfer.Cancel()
				return ctx.Err()
			}
		}
		// Wait for the in-progress extension without holding the mutex.
		if r.store.TestingKnobs().LeaseTransferBlockedOnExtensionEvent != nil {
			r.store.TestingKnobs().LeaseTransferBlockedOnExtensionEvent(nextLeaseHolder)
		}
		select {
		case <-extension.C():
			continue
		case <-ctx.Done():
			extension.Cancel()
			return ctx.Err()
		}
	}
}

// GetLease returns the lease and, if available, the proposed next lease.
func (r *Replica) GetLease() (roachpb.Lease, roachpb.Lease) {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.getLeaseRLocked()
}

func (r *Replica) getLeaseRLocked() (roachpb.Lease, roachpb.Lease) {
	if nextLease, ok := r.mu.pendingLeaseRequest.RequestPending(); ok {
		return *r.mu.state.Lease, nextLease
	}
	return *r.mu.state.Lease, roachpb.Lease{}
}

// OwnsValidLease returns whether this replica is the current valid
// leaseholder. Note that this method does not check to see if a transfer is
// pending, but returns the status of the current lease and ownership at the
// specified point in time.
func (r *Replica) OwnsValidLease(ts hlc.Timestamp) bool {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.ownsValidLeaseRLocked(ts)
}

func (r *Replica) ownsValidLeaseRLocked(ts hlc.Timestamp) bool {
	return r.mu.state.Lease.OwnedBy(r.store.StoreID()) &&
		r.leaseStatus(*r.mu.state.Lease, ts, r.mu.minLeaseProposedTS).State == kvserverpb.LeaseState_VALID
}
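
// Illustrative sketch (not from the original file): AdminTransferLease blocks
// until the transfer resolves (or joins an identical in-flight transfer), so a
// hypothetical caller moving a lease to another store would simply do:
//
//	if err := r.AdminTransferLease(ctx, targetStoreID); err != nil {
//		// Not the leaseholder, target not a VOTER_FULL replica, or a
//		// conflicting transfer/extension was in progress.
//		return err
//	}
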
// IsLeaseValid returns true if the replica's lease is owned by this
// replica and is valid (not expired, not in stasis).
func (r *Replica) IsLeaseValid(lease roachpb.Lease, ts hlc.Timestamp) bool {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.isLeaseValidRLocked(lease, ts)
}

func (r *Replica) isLeaseValidRLocked(lease roachpb.Lease, ts hlc.Timestamp) bool {
	return r.leaseStatus(lease, ts, r.mu.minLeaseProposedTS).State == kvserverpb.LeaseState_VALID
}

// newNotLeaseHolderError returns a NotLeaseHolderError initialized with the
// replica for the holder (if any) of the given lease.
//
// Note that this error can be generated on the Raft processing goroutine, so
// its output should be completely determined by its parameters.
func newNotLeaseHolderError(
	l *roachpb.Lease, proposerStoreID roachpb.StoreID, rangeDesc *roachpb.RangeDescriptor,
) *roachpb.NotLeaseHolderError {
	err := &roachpb.NotLeaseHolderError{
		RangeID: rangeDesc.RangeID,
	}
	if proposerStoreID != 0 {
		err.Replica, _ = rangeDesc.GetReplicaDescriptor(proposerStoreID)
	}
	if l != nil {
		// Normally, we return the lease-holding Replica here. However, in the
		// case in which a leader removes itself, we want the followers to
		// avoid handing out a misleading clue (which in itself shouldn't be
		// overly disruptive as the lease would expire and then this method
		// shouldn't be called for it any more, but at the very least it
		// could catch tests in a loop, presumably due to manual clocks).
		_, stillMember := rangeDesc.GetReplicaDescriptor(l.Replica.StoreID)
		if stillMember {
			err.LeaseHolder = &l.Replica
			err.Lease = l
		}
	}
	return err
}

// leaseGoodToGo is a fast-path for lease checks which verifies that an
// existing lease is valid and owned by the current store. This method should
// not be called directly. Use redirectOnOrAcquireLease instead.
func (r *Replica) leaseGoodToGo(ctx context.Context) (kvserverpb.LeaseStatus, bool) {
	timestamp := r.store.Clock().Now()
	r.mu.RLock()
	defer r.mu.RUnlock()

	if r.requiresExpiringLeaseRLocked() {
		// Slow-path for expiration-based leases.
		return kvserverpb.LeaseStatus{}, false
	}

	status := r.leaseStatus(*r.mu.state.Lease, timestamp, r.mu.minLeaseProposedTS)
	if status.State == kvserverpb.LeaseState_VALID && status.Lease.OwnedBy(r.store.StoreID()) {
		// We own the lease...
		if repDesc, err := r.getReplicaDescriptorRLocked(); err == nil {
			if _, ok := r.mu.pendingLeaseRequest.TransferInProgress(repDesc.ReplicaID); !ok {
				// ...and there is no transfer pending.
				return status, true
			}
		}
	}
	return kvserverpb.LeaseStatus{}, false
}
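
// As an illustrative sketch (not from the original file), the fast path above
// lets redirectOnOrAcquireLease below avoid the full slow-path evaluation in
// the common case of a valid epoch-based lease owned by this store:
//
//	if status, ok := r.leaseGoodToGo(ctx); ok {
//		return status, nil // fast path
//	}
//	// Otherwise: re-evaluate leaseStatus under r.mu, possibly requesting,
//	// renewing, or redirecting via requestLeaseLocked / NotLeaseHolderError.
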
// redirectOnOrAcquireLease checks whether this replica has the lease at the
// current timestamp. If it does, returns the lease and its status. If
// another replica currently holds the lease, redirects by returning
// NotLeaseHolderError. If the lease is expired, a renewal is synchronously
// requested. Leases are eagerly renewed when a request with a timestamp
// within rangeLeaseRenewalDuration of the lease expiration is served.
//
// TODO(spencer): for write commands, don't wait while requesting
// the range lease. If the lease acquisition fails, the write cmd
// will fail as well. If it succeeds, as is likely, then the write
// will not incur latency waiting for the command to complete.
// Reads, however, must wait.
//
// TODO(rangeLeaseRenewalDuration): what is rangeLeaseRenewalDuration
// referring to? It appears to have rotted.
func (r *Replica) redirectOnOrAcquireLease(
	ctx context.Context,
) (kvserverpb.LeaseStatus, *roachpb.Error) {
	if status, ok := r.leaseGoodToGo(ctx); ok {
		return status, nil
	}

	// Loop until the lease is held or the replica ascertains the actual
	// lease holder. Returns also on context.Done() (timeout or cancellation).
	var status kvserverpb.LeaseStatus
	for attempt := 1; ; attempt++ {
		timestamp := r.store.Clock().Now()
		llHandle, pErr := func() (*leaseRequestHandle, *roachpb.Error) {
			r.mu.Lock()
			defer r.mu.Unlock()

			status = r.leaseStatus(*r.mu.state.Lease, timestamp, r.mu.minLeaseProposedTS)
			switch status.State {
			case kvserverpb.LeaseState_ERROR:
				// Lease state couldn't be determined.
				log.VEventf(ctx, 2, "lease state couldn't be determined")
				return nil, roachpb.NewError(
					newNotLeaseHolderError(nil, r.store.StoreID(), r.mu.state.Desc))

			case kvserverpb.LeaseState_VALID, kvserverpb.LeaseState_STASIS:
				if !status.Lease.OwnedBy(r.store.StoreID()) {
					_, stillMember := r.mu.state.Desc.GetReplicaDescriptor(status.Lease.Replica.StoreID)
					if !stillMember {
						// This would be the situation in which the lease holder gets removed when
						// holding the lease, or in which a lease request erroneously gets accepted
						// for a replica that is not in the replica set. Neither of the two can
						// happen in normal usage since appropriate mechanisms have been added:
						//
						// 1. Only the lease holder (at the time) schedules removal of a replica,
						// but the lease can change hands and so the situation in which a follower
						// coordinates a replica removal of the (new) lease holder is possible (if
						// unlikely) in practice. In this situation, the new lease holder would at
						// some point be asked to propose the replica change's EndTxn to Raft. A
						// check has been added that prevents proposals that amount to the removal
						// of the proposer's (and hence lease holder's) Replica, preventing this
						// scenario.
						//
						// 2. A lease is accepted for a Replica that has been removed. Without
						// precautions, this could happen because lease requests are special in
						// that they are the only command that is proposed on a follower (other
						// commands may be proposed from followers, but not successfully so). For
						// all proposals, processRaftCommand checks that their ProposalLease is
						// compatible with the active lease for the log position. For commands
						// proposed on the lease holder, the spanlatch manager then serializes
						// everything. But lease requests get created on followers based on their
						// local state and thus without being sequenced through latching. Thus
						// a recently removed follower (unaware of its own removal) could submit
						// a proposal for the lease (correctly using as a ProposerLease the last
						// active lease), and would receive it given the up-to-date ProposerLease.
						// Hence, an extra check is in order: processRaftCommand makes sure that
						// lease requests for a replica not in the descriptor are bounced.
						//
						// However, this is possible if the `cockroach debug
						// unsafe-remove-dead-replicas` command has been used, so
						// this is just a logged error instead of a fatal
						// assertion.
						log.Errorf(ctx, "lease %s owned by replica %+v that no longer exists",
							status.Lease, status.Lease.Replica)
					}
					// Otherwise, if the lease is currently held by another replica, redirect
					// to the holder.
					return nil, roachpb.NewError(
						newNotLeaseHolderError(&status.Lease, r.store.StoreID(), r.mu.state.Desc))
				}
				// Check that we're not in the process of transferring the lease away.
				// If we are transferring the lease away, we can't serve reads or
				// propose Raft commands - see comments on TransferLease.
				// TODO(andrei): If the lease is being transferred, consider returning a
				// new error type so the client backs off until the transfer is
				// completed.
				repDesc, err := r.getReplicaDescriptorRLocked()
				if err != nil {
					return nil, roachpb.NewError(err)
				}
				if transferLease, ok := r.mu.pendingLeaseRequest.TransferInProgress(
					repDesc.ReplicaID); ok {
					return nil, roachpb.NewError(
						newNotLeaseHolderError(&transferLease, r.store.StoreID(), r.mu.state.Desc))
				}

				// If the lease is in stasis, we can't serve requests until we've
				// renewed the lease, so we return the handle to block on renewal.
				// Otherwise, we don't need to wait for the extension and simply
				// ignore the returned handle (whose channel is buffered) and continue.
				if status.State == kvserverpb.LeaseState_STASIS {
					return r.requestLeaseLocked(ctx, status), nil
				}

				// Extend the lease if this range uses expiration-based
				// leases, the lease is in need of renewal, and there's not
				// already an extension pending.
				_, requestPending := r.mu.pendingLeaseRequest.RequestPending()
				if !requestPending && r.requiresExpiringLeaseRLocked() {
					renewal := status.Lease.Expiration.Add(-r.store.cfg.RangeLeaseRenewalDuration().Nanoseconds(), 0)
					if renewal.LessEq(timestamp) {
						if log.V(2) {
							log.Infof(ctx, "extending lease %s at %s", status.Lease, timestamp)
						}
						// We had an active lease to begin with, but we want to trigger
						// a lease extension. We explicitly ignore the returned handle
						// as we won't block on it.
						_ = r.requestLeaseLocked(ctx, status)
					}
				}

			case kvserverpb.LeaseState_EXPIRED:
				// No active lease: Request renewal if a renewal is not already pending.
				log.VEventf(ctx, 2, "request range lease (attempt #%d)", attempt)
				return r.requestLeaseLocked(ctx, status), nil

			case kvserverpb.LeaseState_PROSCRIBED:
				// Lease proposed timestamp is earlier than the min proposed
				// timestamp limit this replica must observe. If this store
				// owns the lease, re-request. Otherwise, redirect.
				if status.Lease.OwnedBy(r.store.StoreID()) {
					log.VEventf(ctx, 2, "request range lease (attempt #%d)", attempt)
					return r.requestLeaseLocked(ctx, status), nil
				}
				// If lease is currently held by another, redirect to holder.
				return nil, roachpb.NewError(
					newNotLeaseHolderError(&status.Lease, r.store.StoreID(), r.mu.state.Desc))
			}

			// Return a nil handle to signal that we have a valid lease.
			return nil, nil
		}()
		if pErr != nil {
			return kvserverpb.LeaseStatus{}, pErr
		}
		if llHandle == nil {
			// We own a valid lease.
			return status, nil
		}

		// Wait for the range lease to finish, or the context to expire.
		pErr = func() (pErr *roachpb.Error) {
			slowTimer := timeutil.NewTimer()
			defer slowTimer.Stop()
			slowTimer.Reset(base.SlowRequestThreshold)
			tBegin := timeutil.Now()
			for {
				select {
				case pErr = <-llHandle.C():
					if pErr != nil {
						switch tErr := pErr.GetDetail().(type) {
						case *roachpb.AmbiguousResultError:
							// This can happen if the RequestLease command we sent has been
							// applied locally through a snapshot: the RequestLeaseRequest
							// cannot be reproposed so we get this ambiguity.
							// We'll just loop around.
							return nil
						case *roachpb.LeaseRejectedError:
							if tErr.Existing.OwnedBy(r.store.StoreID()) {
								// The RequestLease command we sent was rejected because another
								// lease was applied in the meantime, but we own that other
								// lease. So, loop until the current node becomes aware that
								// it's the leaseholder.
								return nil
							}

							// Getting a LeaseRejectedError back means someone else got there
							// first, or the lease request was somehow invalid due to a concurrent
							// change. That concurrent change could have been that this replica was
							// removed (see processRaftCommand), so check for that case before
							// falling back to a NotLeaseHolderError.
							var err error
							if _, descErr := r.GetReplicaDescriptor(); descErr != nil {
								err = descErr
							} else if lease, _ := r.GetLease(); !r.IsLeaseValid(lease, r.store.Clock().Now()) {
								err = newNotLeaseHolderError(nil, r.store.StoreID(), r.Desc())
							} else {
								err = newNotLeaseHolderError(&lease, r.store.StoreID(), r.Desc())
							}
							pErr = roachpb.NewError(err)
						}
						return pErr
					}
					log.Eventf(ctx, "lease acquisition succeeded: %+v", status.Lease)
					return nil
				case <-slowTimer.C:
					slowTimer.Read = true
					log.Warningf(ctx, "have been waiting %s attempting to acquire lease",
						base.SlowRequestThreshold)
					r.store.metrics.SlowLeaseRequests.Inc(1)
					defer func() {
						r.store.metrics.SlowLeaseRequests.Dec(1)
						log.Infof(ctx, "slow lease acquisition finished after %s with error %v after %d attempts", timeutil.Since(tBegin), pErr, attempt)
					}()
				case <-ctx.Done():
					llHandle.Cancel()
					log.VErrEventf(ctx, 2, "lease acquisition failed: %s", ctx.Err())
					return roachpb.NewError(newNotLeaseHolderError(nil, r.store.StoreID(), r.Desc()))
				case <-r.store.Stopper().ShouldStop():
					llHandle.Cancel()
					return roachpb.NewError(newNotLeaseHolderError(nil, r.store.StoreID(), r.Desc()))
				}
			}
		}()
		if pErr != nil {
			return kvserverpb.LeaseStatus{}, pErr
		}
	}
}
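
// The example below is an illustrative sketch, not part of the original file.
// It shows how a hypothetical request path might gate command evaluation on
// holding a valid lease, following the contract documented on
// redirectOnOrAcquireLease above. The method name and the run callback are
// assumptions for illustration only.
func (r *Replica) executeWithLeaseSketch(
	ctx context.Context, run func(kvserverpb.LeaseStatus) *roachpb.Error,
) *roachpb.Error {
	// Acquire or verify the lease; on failure the caller receives a
	// NotLeaseHolderError (or another error) and can redirect the client.
	status, pErr := r.redirectOnOrAcquireLease(ctx)
	if pErr != nil {
		return pErr
	}
	// The returned status carries the timestamp at which the lease was
	// verified; downstream checks should be performed relative to it.
	return run(status)
}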