github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvcoord 12 13 import ( 14 "context" 15 "fmt" 16 "sort" 17 18 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock" 19 "github.com/cockroachdb/cockroach/pkg/roachpb" 20 "github.com/cockroachdb/cockroach/pkg/settings" 21 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 22 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 23 "github.com/cockroachdb/cockroach/pkg/util/log" 24 "github.com/google/btree" 25 ) 26 27 // The degree of the inFlightWrites btree. 28 const txnPipelinerBtreeDegree = 32 29 30 var pipelinedWritesEnabled = settings.RegisterBoolSetting( 31 "kv.transaction.write_pipelining_enabled", 32 "if enabled, transactional writes are pipelined through Raft consensus", 33 true, 34 ) 35 var pipelinedWritesMaxInFlightSize = settings.RegisterByteSizeSetting( 36 // TODO(nvanbenschoten): The need for this extra setting alongside 37 // kv.transaction.max_intents_bytes indicates that we should explore 38 // the unification of intent tracking and in-flight write tracking. 39 // The two mechanisms track subtly different information, but there's 40 // no fundamental reason why they can't be unified. 41 "kv.transaction.write_pipelining_max_outstanding_size", 42 "maximum number of bytes used to track in-flight pipelined writes before disabling pipelining", 43 1<<18, /* 256 KB */ 44 ) 45 var pipelinedWritesMaxBatchSize = settings.RegisterNonNegativeIntSetting( 46 "kv.transaction.write_pipelining_max_batch_size", 47 "if non-zero, defines that maximum size batch that will be pipelined through Raft consensus", 48 // NB: there is a tradeoff between the overhead of synchronously waiting for 49 // consensus for a batch if we don't pipeline and proving that all of the 50 // writes in the batch succeed if we do pipeline. We set this default to a 51 // value which experimentally strikes a balance between the two costs. 52 // 53 // Notably, this is well below sql.max{Insert/Update/Upsert/Delete}BatchSize, 54 // so implicit SQL txns should never pipeline their writes - they should either 55 // hit the 1PC fast-path or should have batches which exceed this limit. 56 128, 57 ) 58 59 // trackedWritesMaxSize is a threshold in bytes for lock spans stored on the 60 // coordinator during the lifetime of a transaction. Locks are included with a 61 // transaction on commit or abort, to be cleaned up asynchronously. If they 62 // exceed this threshold, they're condensed to avoid memory blowup both on the 63 // coordinator and (critically) on the EndTxn command at the Raft group 64 // responsible for the transaction record. 65 // 66 // NB: this is called "max_intents_bytes" instead of "max_lock_bytes" because 67 // it was created before the concept of intents were generalized to locks. 68 // Switching it would require a migration which doesn't seem worth it. 69 var trackedWritesMaxSize = settings.RegisterPublicIntSetting( 70 "kv.transaction.max_intents_bytes", 71 "maximum number of bytes used to track locks in transactions", 72 1<<18, /* 256 KB */ 73 ) 74 75 // txnPipeliner is a txnInterceptor that pipelines transactional writes by using 76 // asynchronous consensus. The interceptor then tracks all writes that have been 77 // asynchronously proposed through Raft and ensures that all interfering 78 // requests chain on to them by first proving that the async writes succeeded. 79 // The interceptor also ensures that when committing a transaction all writes 80 // that have been proposed but not proven to have succeeded are first checked 81 // before considering the transaction committed. These async writes are referred 82 // to as "in-flight writes" and this process of proving that an in-flight write 83 // succeeded is called "proving" the write. Once writes are proven to have 84 // finished, they are considered "stable". 85 // 86 // Chaining on to in-flight async writes is important for two main reasons to 87 // txnPipeliner: 88 // 89 // 1. requests proposed to Raft will not necessarily succeed. For any number of 90 // reasons, the request may make it through Raft and be discarded or fail to 91 // ever even be replicated. A transaction must check that all async writes 92 // succeeded before committing. However, when these proposals do fail, their 93 // errors aren't particularly interesting to a transaction. This is because 94 // these errors are not deterministic Transaction-domain errors that a 95 // transaction must adhere to for correctness such as conditional-put errors or 96 // other symptoms of constraint violations. These kinds of errors are all 97 // discovered during write *evaluation*, which an async write will perform 98 // synchronously before consensus. Any error during consensus is outside of the 99 // Transaction-domain and can always trigger a transaction retry. 100 // 101 // 2. transport layers beneath the txnPipeliner do not provide strong enough 102 // ordering guarantees between concurrent requests in the same transaction to 103 // avoid needing explicit chaining. For instance, DistSender uses unary gRPC 104 // requests instead of gRPC streams, so it can't natively expose strong ordering 105 // guarantees. Perhaps more importantly, even when a command has acquired latches 106 // and evaluated on a Replica, it is not guaranteed to be applied before 107 // interfering commands. This is because the command may be retried outside of 108 // the serialization of the spanlatch manager for any number of reasons, such as 109 // leaseholder changes. When the command re-acquired its latches, it's possible 110 // that interfering commands may jump ahead of it. To combat this, the 111 // txnPipeliner uses chaining to throw an error when these re-orderings would 112 // have affected the order that transactional requests evaluate in. 113 // 114 // The interceptor proves all in-flight writes before explicitly committing a 115 // transaction by tacking on a QueryIntent request for each one to the front of 116 // an EndTxn(Commit=true) request. The in-flight writes that are being queried 117 // in the batch with the EndTxn request are treated as in-flight writes for the 118 // purposes of parallel commits. The effect of this is that the in-flight writes 119 // must all be proven for a transaction to be considered implicitly committed. 120 // It also follows that they will need to be queried during transaction 121 // recovery. 122 // 123 // This is beneficial from the standpoint of latency because it means that the 124 // consensus latency for every write in a transaction, including the write to 125 // the transaction record, is paid in parallel (mod pipeline stalls) and an 126 // entire transaction can commit in a single consensus round-trip! 127 // 128 // On the flip side, this means that every unproven write is considered 129 // in-flight at the time of the commit and needs to be proven at the time of the 130 // commit. This is a little unfortunate because a transaction could have 131 // accumulated a large number of in-flight writes over a long period of time 132 // without proving any of them, and the more of these writes there are, the 133 // greater the chance that querying one of them gets delayed and delays the 134 // overall transaction. Additionally, the more of these writes there are, the 135 // more expensive transaction recovery will be if the transaction ends up stuck 136 // in an indeterminate commit state. 137 // 138 // Three approaches have been considered to address this, all of which revolve 139 // around the idea that earlier writes in a transaction may have finished 140 // consensus well before the EndTxn is sent. Following this logic, it would be 141 // in the txnPipeliner's best interest to prove in-flight writes as early as 142 // possible, even if no other overlapping requests force them to be proven. The 143 // approaches are: 144 // 145 // 1. launch a background process after each successful async write to query its 146 // intents and wait for it to succeed. This would effectively solve the issue, 147 // but at the cost of many more goroutines and many more QueryIntent requests, 148 // most of which would be redundant because their corresponding write wouldn't 149 // complete until after an EndTxn synchronously needed to prove them anyway. 150 // 151 // 2. to address the issue of an unbounded number of background goroutines 152 // proving writes in approach 1, a single background goroutine could be run 153 // that repeatedly loops over all in-flight writes and attempts to prove 154 // them. This approach was used in an early revision of #26599 and has the nice 155 // property that only one batch of QueryIntent requests is ever active at a 156 // given time. It may be revisited, but for now it is not used for the same 157 // reason as approach 1: most of its QueryIntent requests will be useless 158 // because a transaction will send an EndTxn immediately after sending all 159 // of its writes. 160 // 161 // 3. turn the KV interface into a streaming protocol (#8360) that could support 162 // returning multiple results. This would allow clients to return immediately 163 // after a writes "evaluation" phase completed but hold onto a handle to the 164 // request and be notified immediately after its "replication" phase completes. 165 // This would allow txnPipeliner to prove in-flight writes immediately after 166 // they finish consensus without any extra RPCs. 167 // 168 // So far, none of these approaches have been integrated. 169 // 170 // The txnPipeliner also tracks the locks that a transaction has acquired in a 171 // set of spans known as the "lock footprint". This lock footprint contains 172 // spans encompassing all keys and key ranges where locks have been acquired at 173 // some point by the transaction. This set includes the bounds of locks acquired 174 // by all locking read and write requests. Additionally, it includes the bounds 175 // of locks acquired by the current and all previous epochs. These spans are 176 // attached to any end transaction request that is passed through the pipeliner 177 // to ensure that they the locks within them are released. 178 type txnPipeliner struct { 179 st *cluster.Settings 180 riGen rangeIteratorFactory // used to condense lock spans, if provided 181 wrapped lockedSender 182 disabled bool 183 184 // In-flight writes are intent point writes that have not yet been proved 185 // to have succeeded. They will need to be proven before the transaction 186 // can commit. 187 ifWrites inFlightWriteSet 188 // The transaction's lock footprint contains spans where locks (replicated 189 // and unreplicated) have been acquired at some point by the transaction. 190 // The span set contains spans encompassing the keys from all intent writes 191 // that have already been proven during this epoch and the keys from all 192 // locking reads that have been performed during this epoch. Additionally, 193 // the span set contains all locks held at the end of prior epochs. All of 194 // the transaction's in-flight writes are morally in this set as well, but 195 // they are not stored here to avoid duplication. 196 // 197 // Unlike the in-flight writes, this set does not need to be tracked with 198 // full precision. Instead, the tracking can be an overestimate (i.e. the 199 // spans may cover keys never locked) and should be thought of as an 200 // upper-bound on the influence that the transaction has had. The set 201 // contains all keys spans that the transaction will need to eventually 202 // clean up upon its completion. 203 lockFootprint condensableSpanSet 204 } 205 206 // condensableSpanSetRangeIterator describes the interface of RangeIterator 207 // needed by the condensableSpanSetRangeIterator. Useful for mocking an 208 // iterator in tests. 209 type condensableSpanSetRangeIterator interface { 210 Valid() bool 211 Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) 212 Error() error 213 Desc() *roachpb.RangeDescriptor 214 } 215 216 // rangeIteratorFactory is used to create a condensableSpanSetRangeIterator 217 // lazily. It's used to avoid allocating an iterator when it's not needed. The 218 // factory can be configured either with a callback, used for mocking in tests, 219 // or with a DistSender. Can also be left empty for unittests that don't push 220 // memory limits in their span sets (and thus don't need collapsing). 221 type rangeIteratorFactory struct { 222 factory func() condensableSpanSetRangeIterator 223 ds *DistSender 224 } 225 226 // newRangeIterator creates a range iterator. If no factory was configured, it panics. 227 func (f rangeIteratorFactory) newRangeIterator() condensableSpanSetRangeIterator { 228 if f.factory != nil { 229 return f.factory() 230 } 231 if f.ds != nil { 232 return NewRangeIterator(f.ds) 233 } 234 panic("no iterator factory configured") 235 } 236 237 // SendLocked implements the lockedSender interface. 238 func (tp *txnPipeliner) SendLocked( 239 ctx context.Context, ba roachpb.BatchRequest, 240 ) (*roachpb.BatchResponse, *roachpb.Error) { 241 // If an EndTxn request is part of this batch, attach the in-flight writes 242 // and the lock footprint to it. 243 ba, pErr := tp.attachLocksToEndTxn(ctx, ba) 244 if pErr != nil { 245 return nil, pErr 246 } 247 248 // Adjust the batch so that it doesn't miss any in-flight writes. 249 ba = tp.chainToInFlightWrites(ba) 250 251 // Send through wrapped lockedSender. Unlocks while sending then re-locks. 252 br, pErr := tp.wrapped.SendLocked(ctx, ba) 253 254 // Update the in-flight write set and the lock footprint with the results of 255 // the request. 256 tp.updateLockTracking(ctx, ba, br) 257 if pErr != nil { 258 return nil, tp.adjustError(ctx, ba, pErr) 259 } 260 return tp.stripQueryIntents(br), nil 261 } 262 263 // attachLocksToEndTxn attaches the in-flight writes and the lock footprint that 264 // the interceptor has been tracking to any EndTxn requests present in the 265 // provided batch. It augments these sets with locking requests from the current 266 // batch. 267 func (tp *txnPipeliner) attachLocksToEndTxn( 268 ctx context.Context, ba roachpb.BatchRequest, 269 ) (roachpb.BatchRequest, *roachpb.Error) { 270 args, hasET := ba.GetArg(roachpb.EndTxn) 271 if !hasET { 272 return ba, nil 273 } 274 et := args.(*roachpb.EndTxnRequest) 275 if len(et.LockSpans) > 0 { 276 return ba, roachpb.NewErrorf("client must not pass intents to EndTxn") 277 } 278 if len(et.InFlightWrites) > 0 { 279 return ba, roachpb.NewErrorf("client must not pass in-flight writes to EndTxn") 280 } 281 282 // Populate et.LockSpans and et.InFlightWrites. 283 if !tp.lockFootprint.empty() { 284 et.LockSpans = append([]roachpb.Span(nil), tp.lockFootprint.asSlice()...) 285 } 286 if inFlight := tp.ifWrites.len(); inFlight != 0 { 287 et.InFlightWrites = make([]roachpb.SequencedWrite, 0, inFlight) 288 tp.ifWrites.ascend(func(w *inFlightWrite) { 289 et.InFlightWrites = append(et.InFlightWrites, w.SequencedWrite) 290 }) 291 } 292 293 // Augment et.LockSpans and et.InFlightWrites with writes from the current 294 // batch. 295 for _, ru := range ba.Requests[:len(ba.Requests)-1] { 296 req := ru.GetInner() 297 h := req.Header() 298 if roachpb.IsLocking(req) { 299 // Ranged writes are added immediately to the lock spans because 300 // it's not clear where they will actually leave intents. Point 301 // writes are added to the in-flight writes set. All other locking 302 // requests are also added to the lock spans. 303 // 304 // If we see any ranged writes then we know that the txnCommitter 305 // will fold the in-flight writes into the lock spans immediately 306 // and forgo a parallel commit, but let's not break that abstraction 307 // boundary here. 308 if roachpb.IsIntentWrite(req) && !roachpb.IsRange(req) { 309 w := roachpb.SequencedWrite{Key: h.Key, Sequence: h.Sequence} 310 et.InFlightWrites = append(et.InFlightWrites, w) 311 } else { 312 et.LockSpans = append(et.LockSpans, h.Span()) 313 } 314 } 315 } 316 317 // Sort both sets and condense the lock spans. 318 et.LockSpans, _ = roachpb.MergeSpans(et.LockSpans) 319 sort.Sort(roachpb.SequencedWriteBySeq(et.InFlightWrites)) 320 321 if log.V(3) { 322 for _, intent := range et.LockSpans { 323 log.Infof(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey) 324 } 325 for _, write := range et.InFlightWrites { 326 log.Infof(ctx, "in-flight: %d:%s", write.Sequence, write.Key) 327 } 328 } 329 return ba, nil 330 } 331 332 // chainToInFlightWrites ensures that we "chain" on to any in-flight writes that 333 // overlap the keys we're trying to read/write. We do this by prepending 334 // QueryIntent requests with the ErrorIfMissing option before each request that 335 // touches any of the in-flight writes. In effect, this allows us to prove that 336 // a write succeeded before depending on its existence. We later prune down the 337 // list of writes we proved to exist that are no longer "in-flight" in 338 // updateLockTracking. 339 func (tp *txnPipeliner) chainToInFlightWrites(ba roachpb.BatchRequest) roachpb.BatchRequest { 340 asyncConsensus := pipelinedWritesEnabled.Get(&tp.st.SV) && !tp.disabled 341 342 // We provide a setting to bound the size of in-flight writes that the 343 // pipeliner is tracking. If this batch would push us over this setting, 344 // don't allow it to perform async consensus. 345 addedIFBytes := int64(0) 346 maxIFBytes := pipelinedWritesMaxInFlightSize.Get(&tp.st.SV) 347 348 // We provide a setting to bound the number of writes we permit in a batch 349 // that uses async consensus. This is useful because we'll have to prove 350 // each write that uses async consensus using a QueryIntent, so there's a 351 // point where it makes more sense to just perform consensus for the entire 352 // batch synchronously and avoid all of the overhead of pipelining. 353 if maxBatch := pipelinedWritesMaxBatchSize.Get(&tp.st.SV); maxBatch > 0 { 354 batchSize := int64(len(ba.Requests)) 355 if batchSize > maxBatch { 356 asyncConsensus = false 357 } 358 } 359 360 forked := false 361 oldReqs := ba.Requests 362 // TODO(nvanbenschoten): go 1.11 includes an optimization to quickly clear 363 // out an entire map. That might make it cost effective to maintain a single 364 // chainedKeys map between calls to this function. 365 var chainedKeys map[string]struct{} 366 for i, ru := range oldReqs { 367 if !asyncConsensus && !forked && tp.ifWrites.len() == len(chainedKeys) { 368 // If there are no in-flight writes or all in-flight writes 369 // have been chained onto and async consensus is disallowed, 370 // short-circuit immediately. 371 break 372 } 373 req := ru.GetInner() 374 375 if asyncConsensus { 376 // If we're currently planning on performing the batch with 377 // performing async consensus, determine whether this request 378 // changes that. 379 if !roachpb.IsIntentWrite(req) || roachpb.IsRange(req) { 380 // Only allow batches consisting of solely transactional point 381 // writes to perform consensus asynchronously. 382 // TODO(nvanbenschoten): We could allow batches with reads and point 383 // writes to perform async consensus, but this would be a bit 384 // tricky. Any read would need to chain on to any write that came 385 // before it in the batch and overlaps. For now, it doesn't seem 386 // worth it. 387 asyncConsensus = false 388 } else { 389 // Only allow batches that would not push us over the maximum 390 // in-flight write size limit to perform consensus asynchronously. 391 // 392 // NB: this estimation is conservative because it doesn't factor 393 // in that some writes may be proven by this batch and removed 394 // from the in-flight write set. The real accounting in 395 // inFlightWriteSet.{insert,remove} gets this right. 396 addedIFBytes += keySize(req.Header().Key) 397 asyncConsensus = (tp.ifWrites.byteSize() + addedIFBytes) <= maxIFBytes 398 } 399 } 400 401 if tp.ifWrites.len() > len(chainedKeys) { 402 // For each conflicting in-flight write, add a QueryIntent request 403 // to the batch to assert that it has succeeded and "chain" onto it. 404 writeIter := func(w *inFlightWrite) { 405 // We don't want to modify the batch's request slice directly, 406 // so fork it before modifying it. 407 if !forked { 408 ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests[:i]...) 409 forked = true 410 } 411 412 if _, ok := chainedKeys[string(w.Key)]; !ok { 413 // The write has not already been chained onto by an earlier 414 // request in this batch. Add a QueryIntent request to the 415 // batch (before the conflicting request) to ensure that we 416 // chain on to the success of the in-flight write. 417 meta := ba.Txn.TxnMeta 418 meta.Sequence = w.Sequence 419 ba.Add(&roachpb.QueryIntentRequest{ 420 RequestHeader: roachpb.RequestHeader{ 421 Key: w.Key, 422 }, 423 Txn: meta, 424 ErrorIfMissing: true, 425 }) 426 427 // Record that the key has been chained onto at least once 428 // in this batch so that we don't chain onto it again. 429 if chainedKeys == nil { 430 chainedKeys = make(map[string]struct{}) 431 } 432 chainedKeys[string(w.Key)] = struct{}{} 433 } 434 } 435 436 if !roachpb.IsTransactional(req) { 437 // Non-transactional requests require that we stall the entire 438 // pipeline by chaining on to all in-flight writes. This is 439 // because their request header is often insufficient to 440 // determine all of the keys that they will interact with. 441 tp.ifWrites.ascend(writeIter) 442 } else if et, ok := req.(*roachpb.EndTxnRequest); ok { 443 if et.Commit { 444 // EndTxns need to prove all in-flight writes before being 445 // allowed to succeed themselves. 446 tp.ifWrites.ascend(writeIter) 447 } 448 } else { 449 // Transactional reads and writes needs to chain on to any 450 // overlapping in-flight writes. 451 s := req.Header().Span() 452 tp.ifWrites.ascendRange(s.Key, s.EndKey, writeIter) 453 } 454 } 455 456 // If the BatchRequest's slice of requests has been forked from the original, 457 // append the request to the new slice. 458 if forked { 459 ba.Add(req) 460 } 461 } 462 463 // Set the batch's AsyncConsensus flag based on whether AsyncConsensus is 464 // permitted for the batch. 465 ba.AsyncConsensus = asyncConsensus 466 return ba 467 } 468 469 // updateLockTracking reads the response for the given request and uses it to 470 // update the tracked in-flight write set and lock footprint. It does so by 471 // performing three actions: 472 // 1. it adds all async writes that the request performed to the in-flight 473 // write set. 474 // 2. it adds all non-async writes and locking reads that the request 475 // performed to the lock footprint. 476 // 3. it moves all in-flight writes that the request proved to exist from 477 // the in-flight writes set to the lock footprint. 478 // 479 // After updating the write sets, the lock footprint is condensed to ensure that 480 // it remains under its memory limit. 481 // 482 // If no response is provided (indicating an error), all writes from the batch 483 // are added directly to the lock footprint to avoid leaking any locks when the 484 // transaction cleans up. 485 func (tp *txnPipeliner) updateLockTracking( 486 ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, 487 ) { 488 // After adding new writes to the lock footprint, check whether we need to 489 // condense the set to stay below memory limits. 490 defer tp.lockFootprint.maybeCondense(ctx, tp.riGen, trackedWritesMaxSize.Get(&tp.st.SV)) 491 492 // If the request failed, add all lock acquisitions attempts directly to the 493 // lock footprint. This reduces the likelihood of dangling locks blocking 494 // concurrent requests for extended periods of time. See #3346. 495 if br == nil { 496 // The transaction cannot continue in this epoch whether this is 497 // a retryable error or not. 498 ba.LockSpanIterate(nil, tp.trackLocks) 499 return 500 } 501 502 // Similarly, if the transaction is now finalized, we don't need to 503 // accurately update the lock tracking. 504 if br.Txn.Status.IsFinalized() { 505 switch br.Txn.Status { 506 case roachpb.ABORTED: 507 // If the transaction is now ABORTED, add all locks acquired by the 508 // batch directly to the lock footprint. We don't know which of 509 // these succeeded. 510 ba.LockSpanIterate(nil, tp.trackLocks) 511 case roachpb.COMMITTED: 512 // If the transaction is now COMMITTED, it must not have any more 513 // in-flight writes, so clear them. Technically we should move all 514 // of these to the lock footprint, but since the transaction is 515 // already committed, there's no reason to. 516 tp.ifWrites.clear( 517 /* reuse - we're not going to use this Btree again, so there's no point in 518 moving the nodes to a free list */ 519 false) 520 default: 521 panic("unexpected") 522 } 523 return 524 } 525 526 for i, ru := range ba.Requests { 527 req := ru.GetInner() 528 resp := br.Responses[i].GetInner() 529 530 if qiReq, ok := req.(*roachpb.QueryIntentRequest); ok { 531 // Remove any in-flight writes that were proven to exist. 532 // It shouldn't be possible for a QueryIntentRequest with 533 // the ErrorIfMissing option set to return without error 534 // and with with FoundIntent=false, but we handle that 535 // case here because it happens a lot in tests. 536 if resp.(*roachpb.QueryIntentResponse).FoundIntent { 537 tp.ifWrites.remove(qiReq.Key, qiReq.Txn.Sequence) 538 // Move to lock footprint. 539 tp.lockFootprint.insert(roachpb.Span{Key: qiReq.Key}) 540 } 541 } else if roachpb.IsLocking(req) { 542 // If the request intended to acquire locks, track its lock spans. 543 if ba.AsyncConsensus { 544 // Record any writes that were performed asynchronously. We'll 545 // need to prove that these succeeded sometime before we commit. 546 header := req.Header() 547 tp.ifWrites.insert(header.Key, header.Sequence) 548 } else { 549 // If the lock acquisitions weren't performed asynchronously 550 // then add them directly to our lock footprint. Locking read 551 // requests will always hit this path because they will never 552 // use async consensus. 553 if sp, ok := roachpb.ActualSpan(req, resp); ok { 554 tp.lockFootprint.insert(sp) 555 } 556 } 557 } 558 } 559 } 560 561 func (tp *txnPipeliner) trackLocks(s roachpb.Span, _ lock.Durability) { 562 tp.lockFootprint.insert(s) 563 } 564 565 // stripQueryIntents adjusts the BatchResponse to hide the fact that this 566 // interceptor added new requests to the batch. It returns an adjusted batch 567 // response without the responses that correspond to these added requests. 568 func (tp *txnPipeliner) stripQueryIntents(br *roachpb.BatchResponse) *roachpb.BatchResponse { 569 j := 0 570 for i, ru := range br.Responses { 571 if ru.GetQueryIntent() != nil { 572 continue 573 } 574 if i != j { 575 br.Responses[j] = br.Responses[i] 576 } 577 j++ 578 } 579 br.Responses = br.Responses[:j] 580 return br 581 } 582 583 // adjustError adjusts the provided error based on the request that caused it. 584 // It transforms any IntentMissingError into a TransactionRetryError and fixes 585 // the error's index position. 586 func (tp *txnPipeliner) adjustError( 587 ctx context.Context, ba roachpb.BatchRequest, pErr *roachpb.Error, 588 ) *roachpb.Error { 589 // Fix the error index to hide the impact of any QueryIntent requests. 590 if pErr.Index != nil { 591 before := int32(0) 592 for _, ru := range ba.Requests[:int(pErr.Index.Index)] { 593 req := ru.GetInner() 594 if req.Method() == roachpb.QueryIntent { 595 before++ 596 } 597 } 598 pErr.Index.Index -= before 599 } 600 601 // Turn an IntentMissingError into a transactional retry error. 602 if ime, ok := pErr.GetDetail().(*roachpb.IntentMissingError); ok { 603 log.VEventf(ctx, 2, "transforming intent missing error into retry: %v", ime) 604 err := roachpb.NewTransactionRetryError( 605 roachpb.RETRY_ASYNC_WRITE_FAILURE, fmt.Sprintf("missing intent on: %s", ime.Key)) 606 retryErr := roachpb.NewErrorWithTxn(err, pErr.GetTxn()) 607 retryErr.Index = pErr.Index 608 return retryErr 609 } 610 return pErr 611 } 612 613 // setWrapped implements the txnInterceptor interface. 614 func (tp *txnPipeliner) setWrapped(wrapped lockedSender) { 615 tp.wrapped = wrapped 616 } 617 618 // populateLeafInputState is part of the txnInterceptor interface. 619 func (tp *txnPipeliner) populateLeafInputState(tis *roachpb.LeafTxnInputState) { 620 tis.InFlightWrites = tp.ifWrites.asSlice() 621 } 622 623 // initializeLeaf loads the in-flight writes for a leaf transaction. 624 func (tp *txnPipeliner) initializeLeaf(tis *roachpb.LeafTxnInputState) { 625 // Copy all in-flight writes into the inFlightWrite tree. 626 for _, w := range tis.InFlightWrites { 627 tp.ifWrites.insert(w.Key, w.Sequence) 628 } 629 } 630 631 // populateLeafFinalState is part of the txnInterceptor interface. 632 func (tp *txnPipeliner) populateLeafFinalState(*roachpb.LeafTxnFinalState) {} 633 634 // importLeafFinalState is part of the txnInterceptor interface. 635 func (tp *txnPipeliner) importLeafFinalState(context.Context, *roachpb.LeafTxnFinalState) {} 636 637 // epochBumpedLocked implements the txnReqInterceptor interface. 638 func (tp *txnPipeliner) epochBumpedLocked() { 639 // Move all in-flight writes into the lock footprint. These writes no longer 640 // need to be tracked precisely, but we don't want to forget about them and 641 // fail to clean them up. 642 if tp.ifWrites.len() > 0 { 643 tp.ifWrites.ascend(func(w *inFlightWrite) { 644 tp.lockFootprint.insert(roachpb.Span{Key: w.Key}) 645 }) 646 tp.lockFootprint.mergeAndSort() 647 tp.ifWrites.clear(true /* reuse */) 648 } 649 } 650 651 // createSavepointLocked is part of the txnReqInterceptor interface. 652 func (tp *txnPipeliner) createSavepointLocked(context.Context, *savepoint) {} 653 654 // rollbackToSavepointLocked is part of the txnReqInterceptor interface. 655 func (tp *txnPipeliner) rollbackToSavepointLocked(ctx context.Context, s savepoint) { 656 // Move all the writes in txnPipeliner that are not in the savepoint to the 657 // lock footprint. We no longer care if these write succeed or fail, so we're 658 // going to stop tracking these as in-flight writes. The respective intents 659 // still need to be cleaned up at the end of the transaction. 660 var writesToDelete []*inFlightWrite 661 needCollecting := !s.Initial() 662 tp.ifWrites.ascend(func(w *inFlightWrite) { 663 if w.Sequence > s.seqNum { 664 tp.lockFootprint.insert(roachpb.Span{Key: w.Key}) 665 if needCollecting { 666 writesToDelete = append(writesToDelete, w) 667 } 668 } 669 }) 670 tp.lockFootprint.mergeAndSort() 671 672 // Restore the inflight writes from the savepoint (minus the ones that have 673 // been verified in the meantime) by removing all the extra ones. 674 if needCollecting { 675 for _, ifw := range writesToDelete { 676 tp.ifWrites.remove(ifw.Key, ifw.Sequence) 677 } 678 } else { 679 tp.ifWrites.clear(true /* reuse */) 680 } 681 } 682 683 // closeLocked implements the txnReqInterceptor interface. 684 func (tp *txnPipeliner) closeLocked() {} 685 686 // hasAcquiredLocks returns whether the interceptor has made an attempt to 687 // acquire any locks, whether doing so was known to be successful or not. 688 func (tp *txnPipeliner) hasAcquiredLocks() bool { 689 return tp.ifWrites.len() > 0 || !tp.lockFootprint.empty() 690 } 691 692 // inFlightWrites represent a commitment to proving (via QueryIntent) that 693 // a point write succeeded in replicating an intent with a specific sequence 694 // number. 695 type inFlightWrite struct { 696 roachpb.SequencedWrite 697 } 698 699 // Less implements the btree.Item interface. 700 func (a *inFlightWrite) Less(b btree.Item) bool { 701 return a.Key.Compare(b.(*inFlightWrite).Key) < 0 702 } 703 704 // inFlightWriteSet is an ordered set of in-flight point writes. Given a set 705 // of n elements, the structure supports O(log n) insertion of new in-flight 706 // writes, O(log n) removal of existing in-flight writes, and O(m + log n) 707 // retrieval over m in-flight writes that overlap with a given key. 708 type inFlightWriteSet struct { 709 t *btree.BTree 710 bytes int64 711 712 // Avoids allocs. 713 tmp1, tmp2 inFlightWrite 714 alloc inFlightWriteAlloc 715 } 716 717 // insert attempts to insert an in-flight write that has not been proven to have 718 // succeeded into the in-flight write set. If the write with an equal or larger 719 // sequence number already exists in the set, the method is a no-op. 720 func (s *inFlightWriteSet) insert(key roachpb.Key, seq enginepb.TxnSeq) { 721 if s.t == nil { 722 // Lazily initialize btree. 723 s.t = btree.New(txnPipelinerBtreeDegree) 724 } 725 726 s.tmp1.Key = key 727 item := s.t.Get(&s.tmp1) 728 if item != nil { 729 otherW := item.(*inFlightWrite) 730 if seq > otherW.Sequence { 731 // Existing in-flight write has old information. 732 otherW.Sequence = seq 733 } 734 return 735 } 736 737 w := s.alloc.alloc(key, seq) 738 s.t.ReplaceOrInsert(w) 739 s.bytes += keySize(key) 740 } 741 742 // remove attempts to remove an in-flight write from the in-flight write set. 743 // The method will be a no-op if the write was already proved. Care is taken 744 // not to accidentally remove a write to the same key but at a later epoch or 745 // sequence number. 746 func (s *inFlightWriteSet) remove(key roachpb.Key, seq enginepb.TxnSeq) { 747 if s.len() == 0 { 748 // Set is empty. 749 return 750 } 751 752 s.tmp1.Key = key 753 item := s.t.Get(&s.tmp1) 754 if item == nil { 755 // The write was already proven or the txn epoch was incremented. 756 return 757 } 758 759 w := item.(*inFlightWrite) 760 if seq < w.Sequence { 761 // The sequence might have changed, which means that a new write was 762 // sent to the same key. This write would have been forced to prove 763 // the existence of current write already. 764 return 765 } 766 767 // Delete the write from the in-flight writes set. 768 delItem := s.t.Delete(item) 769 if delItem != nil { 770 *delItem.(*inFlightWrite) = inFlightWrite{} // for GC 771 } 772 s.bytes -= keySize(key) 773 774 // Assert that the byte accounting is believable. 775 if s.bytes < 0 { 776 panic("negative in-flight write size") 777 } else if s.t.Len() == 0 && s.bytes != 0 { 778 panic("non-zero in-flight write size with 0 in-flight writes") 779 } 780 } 781 782 // ascend calls the provided function for every write in the set. 783 func (s *inFlightWriteSet) ascend(f func(w *inFlightWrite)) { 784 if s.len() == 0 { 785 // Set is empty. 786 return 787 } 788 s.t.Ascend(func(i btree.Item) bool { 789 f(i.(*inFlightWrite)) 790 return true 791 }) 792 } 793 794 // ascendRange calls the provided function for every write in the set 795 // with a key in the range [start, end). 796 func (s *inFlightWriteSet) ascendRange(start, end roachpb.Key, f func(w *inFlightWrite)) { 797 if s.len() == 0 { 798 // Set is empty. 799 return 800 } 801 if end == nil { 802 // Point lookup. 803 s.tmp1.Key = start 804 if i := s.t.Get(&s.tmp1); i != nil { 805 f(i.(*inFlightWrite)) 806 } 807 } else { 808 // Range lookup. 809 s.tmp1.Key, s.tmp2.Key = start, end 810 s.t.AscendRange(&s.tmp1, &s.tmp2, func(i btree.Item) bool { 811 f(i.(*inFlightWrite)) 812 return true 813 }) 814 } 815 } 816 817 // len returns the number of the in-flight writes in the set. 818 func (s *inFlightWriteSet) len() int { 819 if s.t == nil { 820 return 0 821 } 822 return s.t.Len() 823 } 824 825 // byteSize returns the size in bytes of the in-flight writes in the set. 826 func (s *inFlightWriteSet) byteSize() int64 { 827 return s.bytes 828 } 829 830 // clear purges all elements from the in-flight write set and frees associated 831 // memory. The reuse flag indicates whether the caller is intending to reuse 832 // the set or not. 833 func (s *inFlightWriteSet) clear(reuse bool) { 834 if s.t == nil { 835 return 836 } 837 s.t.Clear(reuse /* addNodesToFreelist */) 838 s.bytes = 0 839 s.alloc.clear() 840 } 841 842 // asSlice returns the in-flight writes, ordered by key. 843 func (s *inFlightWriteSet) asSlice() []roachpb.SequencedWrite { 844 l := s.len() 845 if l == 0 { 846 return nil 847 } 848 writes := make([]roachpb.SequencedWrite, 0, l) 849 s.ascend(func(w *inFlightWrite) { 850 writes = append(writes, w.SequencedWrite) 851 }) 852 return writes 853 } 854 855 // inFlightWriteAlloc provides chunk allocation of inFlightWrites, 856 // amortizing the overhead of each allocation. 857 type inFlightWriteAlloc []inFlightWrite 858 859 // alloc allocates a new inFlightWrite with the specified key and sequence 860 // number. 861 func (a *inFlightWriteAlloc) alloc(key roachpb.Key, seq enginepb.TxnSeq) *inFlightWrite { 862 // If the current alloc slice has no extra capacity, reallocate a new chunk. 863 if cap(*a)-len(*a) == 0 { 864 const chunkAllocMinSize = 4 865 const chunkAllocMaxSize = 1024 866 867 allocSize := cap(*a) * 2 868 if allocSize < chunkAllocMinSize { 869 allocSize = chunkAllocMinSize 870 } else if allocSize > chunkAllocMaxSize { 871 allocSize = chunkAllocMaxSize 872 } 873 *a = make([]inFlightWrite, 0, allocSize) 874 } 875 876 *a = (*a)[:len(*a)+1] 877 w := &(*a)[len(*a)-1] 878 *w = inFlightWrite{ 879 SequencedWrite: roachpb.SequencedWrite{Key: key, Sequence: seq}, 880 } 881 return w 882 } 883 884 // clear removes all allocated in-flight writes and attempts to reclaim as 885 // much allocated memory as possible. 886 func (a *inFlightWriteAlloc) clear() { 887 for i := range *a { 888 (*a)[i] = inFlightWrite{} // for GC 889 } 890 *a = (*a)[:0] 891 }