github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/txn_interceptor_committer.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvcoord

import (
	"context"
	"sync"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
)

var parallelCommitsEnabled = settings.RegisterBoolSetting(
	"kv.transaction.parallel_commits_enabled",
	"if enabled, transactional commits will be parallelized with transactional writes",
	true,
)

// txnCommitter is a txnInterceptor that concerns itself with committing and
// rolling back transactions. It intercepts EndTxn requests and coordinates
// their execution. This is accomplished either by issuing them directly with
// proper addressing if they are alone, eliding them if they are not needed, or
// coordinating their execution in parallel with the rest of their batch if they
// are part of a larger set of requests.
//
// The third operation listed, which we define as a "parallel commit", is the
// most interesting. Marking a transaction record as committed in parallel with
// writing the rest of the transaction's intents is a clear win in terms of
// latency - in theory it removes the cost of an entire consensus round-trip
// from a transaction. However, doing so safely comes with extra complication.
// It requires an extension to the transaction model, additional client-side
// logic, buy-in from concurrency control, and specialized support from a
// transaction recovery mechanism. txnCommitter is responsible for the parallel
// commit-specific client-side logic.
//
// Parallel commits works by defining a committed transaction as a transaction
// that meets one of the two following commit conditions:
// 1. a transaction is *explicitly committed* if it has a transaction record with
//    a COMMITTED status
// 2. a transaction is *implicitly committed* if it has a transaction record with
//    a STAGING status and intents written for all writes declared as "in-flight"
//    on the transaction record at equal or lower timestamps than the transaction
//    record's commit timestamp
//
// A transaction may move from satisfying the implicit commit condition to
// satisfying the explicit commit condition. This is desirable because it moves
// the commit condition from a distributed condition to one local to the
// transaction record. Regardless, once either commit condition is satisfied, a
// transaction will remain committed in perpetuity both to itself and to all
// concurrent observers.
//
// The txnCommitter interceptor's role in this is to determine the set of writes
// that will be in-flight during a parallel commit. It collects this set from
// both the writes and the query intent requests that it finds present in the
// same batch as the committing end transaction request.
// The writes in this batch indicate a new intent write and the query intent
// requests indicate a previous pipelined intent write that has not yet been
// proven as successful. Before issuing the batch, the txnCommitter attaches
// this set to the end transaction request.
//
// The txnCommitter then collects the response of the batch when it returns.
// Based on the outcome of the requests in the batch, the interceptor determines
// whether the transaction successfully committed by satisfying the implicit
// commit condition.
//
// If all requests in the batch succeeded (including the EndTxn request) then
// the implicit commit condition is satisfied. The interceptor returns a
// successful response up the stack and launches an async task to make the
// commit explicit by moving the transaction record's status from STAGING to
// COMMITTED.
//
// If all requests did not succeed then the implicit commit condition is not
// satisfied and the transaction is still in-progress (and could still be
// committed or aborted at a later time). There are a number of reasons why
// some of the requests in the final batch may have failed:
// - intent writes: these requests may fail to write an intent due to a logical
//   error like a ConditionFailedError. They also could have succeeded at writing
//   an intent but failed to write it at the desired timestamp because they ran
//   into the timestamp cache or another committed value. In the first case, the
//   txnCommitter will receive an error. In the second, it will generate one in
//   needTxnRetryAfterStaging.
// - query intents: these requests may fail because they discover that one of the
//   previously issued writes has failed; either because it never left an intent
//   or because it left one at too high of a timestamp. In this case, the request
//   will return an error because the requests all have the ErrorIfMissing option
//   set. It will also prevent the write from ever succeeding in the future, which
//   ensures that the transaction will never suddenly become implicitly committed
//   at a later point due to the write eventually succeeding (e.g. after a replay).
// - end txn: this request may fail with a TransactionRetryError for any number of
//   reasons, such as if the transaction's provisional commit timestamp has been
//   pushed past its read timestamp. In all of these cases, an error will be
//   returned and the transaction record will not be staged.
//
// If it is unknown whether all of the requests in the final batch succeeded
// (e.g. due to a network error) then an AmbiguousResultError is returned. The
// logic to enforce this is in DistSender.
//
// In all cases, the interceptor abstracts away the details of this from all
// interceptors above it in the coordinator interceptor stack.
type txnCommitter struct {
	st      *cluster.Settings
	stopper *stop.Stopper
	wrapped lockedSender
	mu      sync.Locker
}
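
// Illustrative sketch (the requests and keys below are hypothetical): by the
// time a committing batch reaches this interceptor, the txnPipeliner has
// already populated the EndTxn's InFlightWrites, so a parallel-commit
// candidate batch looks roughly like:
//
//	Put("a", "v")     - new intent write, in-flight during the commit
//	QueryIntent("b")  - proves a previously pipelined write to "b"
//	EndTxn(Commit: true, InFlightWrites: [{Key: "a"}, {Key: "b"}])
//
// If the EndTxn stages the transaction record and both writes succeed at or
// below the record's commit timestamp, the transaction is implicitly
// committed; the async task launched below then makes the commit explicit.
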
// SendLocked implements the lockedSender interface.
func (tc *txnCommitter) SendLocked(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// If the batch does not include an EndTxn request, pass it through.
	rArgs, hasET := ba.GetArg(roachpb.EndTxn)
	if !hasET {
		return tc.wrapped.SendLocked(ctx, ba)
	}
	et := rArgs.(*roachpb.EndTxnRequest)

	// Determine whether we can elide the EndTxn entirely. We can do so if the
	// transaction is read-only, which we determine based on whether the EndTxn
	// request contains any writes.
	if len(et.LockSpans) == 0 && len(et.InFlightWrites) == 0 {
		return tc.sendLockedWithElidedEndTxn(ctx, ba, et)
	}

	// Assign the transaction's key to the Request's header if it isn't already
	// set. This is the only place where EndTxnRequest.Key is assigned, but we
	// could be dealing with a re-issued batch after a refresh. Remember, the
	// committer is below the span refresher on the interceptor stack.
	var etAttempt endTxnAttempt
	if et.Key == nil {
		et.Key = ba.Txn.Key
		etAttempt = endTxnFirstAttempt
	} else {
		// If this is a retry, we'll disable parallel commit. Since the previous
		// attempt might have partially succeeded (i.e. the batch might have been
		// split into sub-batches and some of them might have evaluated
		// successfully), there might be intents lying around. If we performed a
		// parallel commit, the batch got split again, and the STAGING txn record
		// were written before we evaluated some of the other sub-batches, we
		// could technically enter the "implicitly committed" state before all
		// the sub-batches are evaluated. This is problematic: there's a race
		// between evaluating those requests and other pushers coming along and
		// transitioning the txn to explicitly committed (and cleaning up all the
		// intents), and the evaluations of the outstanding sub-batches. If the
		// pushers win, then the re-evaluations will fail because we don't have
		// idempotency of evaluations across a txn commit (for example, the
		// re-evaluations might notice that their transaction is already
		// committed and get confused).
		etAttempt = endTxnRetry
		if len(et.InFlightWrites) > 0 {
			// Make a copy of the EndTxn, since we're going to change it below to
			// disable the parallel commit.
			etCpy := *et
			ba.Requests[len(ba.Requests)-1].SetInner(&etCpy)
			et = &etCpy
		}
	}

	// Determine whether the commit request can be run in parallel with the rest
	// of the requests in the batch. If not, move the in-flight writes currently
	// attached to the EndTxn request to the LockSpans and clear the in-flight
	// write set; no writes will be in-flight concurrently with the EndTxn
	// request.
	if len(et.InFlightWrites) > 0 && !tc.canCommitInParallel(ctx, ba, et, etAttempt) {
		// NB: when parallel commits is disabled, this is the best place to
		// detect whether the batch has only distinct spans. We can set this
		// flag based on whether any of the previously declared in-flight writes
		// in this batch overlap with each other. This will have (rare) false
		// negatives when the in-flight writes overlap with existing lock
		// spans, but never false positives.
		et.LockSpans, ba.Header.DistinctSpans = mergeIntoSpans(et.LockSpans, et.InFlightWrites)
		// Disable parallel commits.
		et.InFlightWrites = nil
	}

	// If the EndTxn request is a rollback, pass it through.
	if !et.Commit {
		return tc.wrapped.SendLocked(ctx, ba)
	}

	// Send the adjusted batch through the wrapped lockedSender. Unlocks while
	// sending then re-locks.
	br, pErr := tc.wrapped.SendLocked(ctx, ba)
	if pErr != nil {
		// If the batch resulted in an error but the EndTxn request succeeded,
		// staging the transaction record in the process, downgrade the status
		// back to PENDING.
		// Even though the transaction record may have a status of STAGING, we
		// know that the transaction failed to implicitly commit, so
		// interceptors above the txnCommitter in the stack don't need to be
		// made aware that the record is staging.
		if txn := pErr.GetTxn(); txn != nil && txn.Status == roachpb.STAGING {
			pErr.SetTxn(cloneWithStatus(txn, roachpb.PENDING))
		}
		return nil, pErr
	}

	// Determine next steps based on the status of the transaction.
	switch br.Txn.Status {
	case roachpb.STAGING:
		// Continue with STAGING-specific validation and cleanup.
	case roachpb.COMMITTED:
		// The transaction is explicitly committed. This is possible if all
		// in-flight writes were sent to the same range as the EndTxn request,
		// in a single batch. In this case, a range can determine that all
		// in-flight writes will succeed with the EndTxn and can decide to skip
		// the STAGING state.
		//
		// This is also possible if we never attached any in-flight writes to
		// the EndTxn request, either because canCommitInParallel returned false
		// or because there were no unproven in-flight writes (see txnPipeliner)
		// and there were no writes in the batch request.
		return br, nil
	default:
		return nil, roachpb.NewErrorf("unexpected response status without error: %v", br.Txn)
	}

	// Determine whether the transaction needs to either retry or refresh. When
	// the EndTxn request evaluated while STAGING the transaction record, it
	// performed this check. However, the transaction proto may have changed due
	// to writes evaluated concurrently with the EndTxn even if none of those
	// writes returned an error. Remember that the transaction proto we see here
	// could be a combination of protos from responses, all merged by
	// DistSender.
	if pErr := needTxnRetryAfterStaging(br); pErr != nil {
		log.VEventf(ctx, 2, "parallel commit failed since some writes were pushed. "+
			"Synthesized err: %s", pErr)
		return nil, pErr
	}

	// If the transaction doesn't need to retry then it is implicitly committed!
	// We're the only ones who know that though -- other concurrent transactions
	// will need to go through the full status resolution process to make a
	// determination about the status of our STAGING transaction. To avoid this,
	// we transition to an explicitly committed transaction as soon as possible.
	// This also has the side-effect of kicking off intent resolution.
	mergedLockSpans, _ := mergeIntoSpans(et.LockSpans, et.InFlightWrites)
	tc.makeTxnCommitExplicitAsync(ctx, br.Txn, mergedLockSpans, ba.CanForwardReadTimestamp)

	// Switch the status on the batch response's transaction to COMMITTED. No
	// interceptor above this one in the stack should ever need to deal with a
	// transaction proto in the STAGING state.
	br.Txn = cloneWithStatus(br.Txn, roachpb.COMMITTED)
	return br, nil
}

// sendLockedWithElidedEndTxn sends the provided batch without its EndTxn
// request. However, if the EndTxn request is alone in the batch, nothing will
// be sent at all. Either way, the result of the EndTxn will be synthesized and
// returned in the batch response.
//
// The method is used for read-only transactions, which never need to write a
// transaction record.
func (tc *txnCommitter) sendLockedWithElidedEndTxn(
	ctx context.Context, ba roachpb.BatchRequest, et *roachpb.EndTxnRequest,
) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
	// Send the batch without its final request, which we know to be the EndTxn
	// request that we're eliding. If this would result in us sending an empty
	// batch, mock out a reply instead of sending anything.
	ba.Requests = ba.Requests[:len(ba.Requests)-1]
	if len(ba.Requests) > 0 {
		br, pErr = tc.wrapped.SendLocked(ctx, ba)
		if pErr != nil {
			return nil, pErr
		}
	} else {
		br = &roachpb.BatchResponse{}
		// NB: there's no need to clone the txn proto here because we already
		// call cloneWithStatus below.
		br.Txn = ba.Txn
	}

	// Check if the (read-only) txn was pushed above its deadline.
	deadline := et.Deadline
	if deadline != nil && deadline.LessEq(br.Txn.WriteTimestamp) {
		return nil, generateTxnDeadlineExceededErr(ba.Txn, *deadline)
	}

	// Update the response's transaction proto. This normally happens on the
	// server and is sent back in response headers, but in this case the EndTxn
	// request was optimized away. The caller may still inspect the transaction
	// struct, so we manually update it here to emulate a true transaction.
	status := roachpb.ABORTED
	if et.Commit {
		status = roachpb.COMMITTED
	}
	br.Txn = cloneWithStatus(br.Txn, status)

	// Synthesize and append an EndTxn response.
	br.Add(&roachpb.EndTxnResponse{})
	return br, nil
}

// endTxnAttempt specifies whether it's the first time that we're attempting to
// evaluate an EndTxn request or whether it's a retry (i.e. after a successful
// refresh). There are some precautions we need to take when sending out
// retries.
type endTxnAttempt int

const (
	endTxnFirstAttempt endTxnAttempt = iota
	endTxnRetry
)

// canCommitInParallel determines whether the batch can issue its committing
// EndTxn in parallel with the rest of its requests and with any in-flight
// writes, which all should have corresponding QueryIntent requests in the
// batch.
func (tc *txnCommitter) canCommitInParallel(
	ctx context.Context, ba roachpb.BatchRequest, et *roachpb.EndTxnRequest, etAttempt endTxnAttempt,
) bool {
	if !parallelCommitsEnabled.Get(&tc.st.SV) {
		return false
	}

	if etAttempt == endTxnRetry {
		log.VEventf(ctx, 2, "retrying batch not eligible for parallel commit")
		return false
	}

	// We're trying to parallel commit, not parallel abort.
	if !et.Commit {
		return false
	}

	// If the transaction has a commit trigger, we don't allow it to commit in
	// parallel with writes. There's no fundamental reason for this restriction,
	// but for now it's not worth the complication.
	if et.InternalCommitTrigger != nil {
		return false
	}

	// Check whether every request in the batch is compatible with a parallel
	// commit. If any are incompatible then we cannot perform a parallel commit.
	// We ignore the last request in the slice because we know it is the EndTxn.
	for _, ru := range ba.Requests[:len(ba.Requests)-1] {
		req := ru.GetInner()
		switch {
		case roachpb.IsIntentWrite(req):
			if roachpb.IsRange(req) {
				// Similar to how we can't pipeline ranged writes, we also can't
				// commit in parallel with them.
				// The reason for this is that the status resolution process for
				// STAGING transactions wouldn't know where to look for the
				// corresponding intents.
				return false
			}
			// All other point writes are included in the EndTxn request's
			// InFlightWrites set and are visible to the status resolution
			// process for STAGING transactions. Populating InFlightWrites
			// has already been done by the txnPipeliner.

		case req.Method() == roachpb.QueryIntent:
			// QueryIntent requests are compatible with parallel commits. The
			// intents being queried are also attached to the EndTxn request's
			// InFlightWrites set and are visible to the status resolution
			// process for STAGING transactions. Populating InFlightWrites has
			// already been done by the txnPipeliner.

		default:
			// All other request types, notably Get and Scan requests, are
			// incompatible with parallel commits because their outcome is
			// not taken into consideration by the status resolution process
			// for STAGING transactions.
			return false
		}
	}
	return true
}

// mergeIntoSpans merges all provided sequenced writes into the span slice. It
// then sorts the spans and merges any that overlap. The function does not
// mutate the provided span slice. Returns true iff all of the spans are
// distinct.
func mergeIntoSpans(s []roachpb.Span, ws []roachpb.SequencedWrite) ([]roachpb.Span, bool) {
	m := make([]roachpb.Span, len(s)+len(ws))
	copy(m, s)
	for i, w := range ws {
		m[len(s)+i] = roachpb.Span{Key: w.Key}
	}
	return roachpb.MergeSpans(m)
}
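
// For illustration (hypothetical inputs): given existing lock spans [{a-c}]
// and in-flight writes at keys "b" and "d", mergeIntoSpans returns
// ([{a-c}, {d}], false) - the point write at "b" folds into the existing span,
// so not all inputs were distinct, while "d" remains its own span.
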
// needTxnRetryAfterStaging determines whether the transaction needs to refresh
// (see txnSpanRefresher) or retry based on the batch response of a parallel
// commit attempt.
func needTxnRetryAfterStaging(br *roachpb.BatchResponse) *roachpb.Error {
	if len(br.Responses) == 0 {
		return roachpb.NewErrorf("no responses in BatchResponse: %v", br)
	}
	lastResp := br.Responses[len(br.Responses)-1].GetInner()
	etResp, ok := lastResp.(*roachpb.EndTxnResponse)
	if !ok {
		return roachpb.NewErrorf("unexpected response in BatchResponse: %v", lastResp)
	}
	if etResp.StagingTimestamp.IsEmpty() {
		return roachpb.NewErrorf("empty StagingTimestamp in EndTxnResponse: %v", etResp)
	}
	if etResp.StagingTimestamp.Less(br.Txn.WriteTimestamp) {
		// If the timestamp that the transaction record was staged at
		// is less than the timestamp of the transaction in the batch
		// response then one of the concurrent writes was pushed to
		// a higher timestamp. This violates the "implicit commit"
		// condition and neither the transaction coordinator nor any
		// other concurrent actor will consider this transaction to
		// be committed as is.
		// Note that we leave the transaction record that we wrote in the STAGING
		// state, which is not ideal. But as long as we continue heartbeating the
		// txn record, it being PENDING or STAGING does not make a difference.
		reason := roachpb.RETRY_SERIALIZABLE
		if br.Txn.WriteTooOld {
			reason = roachpb.RETRY_WRITE_TOO_OLD
		}
		err := roachpb.NewTransactionRetryError(
			reason, "serializability failure concurrent with STAGING")
		txn := cloneWithStatus(br.Txn, roachpb.PENDING)
		return roachpb.NewErrorWithTxn(err, txn)
	}
	return nil
}

// makeTxnCommitExplicitAsync launches an async task that attempts to move the
// transaction from implicitly committed (STAGING status with all intents
// written) to explicitly committed (COMMITTED status). It does so by sending a
// second EndTxnRequest, this time with no InFlightWrites attached.
func (tc *txnCommitter) makeTxnCommitExplicitAsync(
	ctx context.Context, txn *roachpb.Transaction, lockSpans []roachpb.Span, canFwdRTS bool,
) {
	// TODO(nvanbenschoten): consider adding tracing for this request.
	// TODO(nvanbenschoten): add a timeout to this request.
	// TODO(nvanbenschoten): consider making this semi-synchronous to
	// backpressure client writes when these start to slow down. This
	// would be similar to what we do for intent resolution.
	log.VEventf(ctx, 2, "making txn commit explicit: %s", txn)
	if err := tc.stopper.RunAsyncTask(
		context.Background(), "txnCommitter: making txn commit explicit", func(ctx context.Context) {
			tc.mu.Lock()
			defer tc.mu.Unlock()
			if err := makeTxnCommitExplicitLocked(ctx, tc.wrapped, txn, lockSpans, canFwdRTS); err != nil {
				log.Errorf(ctx, "making txn commit explicit failed for %s: %v", txn, err)
			}
		},
	); err != nil {
		log.VErrEventf(ctx, 1, "failed to make txn commit explicit: %v", err)
	}
}

func makeTxnCommitExplicitLocked(
	ctx context.Context,
	s lockedSender,
	txn *roachpb.Transaction,
	lockSpans []roachpb.Span,
	canFwdRTS bool,
) error {
	// Clone the txn to prevent data races.
	txn = txn.Clone()

	// Construct a new batch with just an EndTxn request.
	ba := roachpb.BatchRequest{}
	ba.Header = roachpb.Header{Txn: txn, CanForwardReadTimestamp: canFwdRTS}
	et := roachpb.EndTxnRequest{Commit: true}
	et.Key = txn.Key
	et.LockSpans = lockSpans
	et.CanCommitAtHigherTimestamp = canFwdRTS
	ba.Add(&et)

	_, pErr := s.SendLocked(ctx, ba)
	if pErr != nil {
		switch t := pErr.GetDetail().(type) {
		case *roachpb.TransactionStatusError:
			// Detect whether the error indicates that someone else beat
			// us to explicitly committing the transaction record.
			if t.Reason == roachpb.TransactionStatusError_REASON_TXN_COMMITTED {
				return nil
			}
		case *roachpb.TransactionRetryError:
			logFunc := log.Errorf
			if util.RaceEnabled {
				logFunc = log.Fatalf
			}
			logFunc(ctx, "unexpected retry error when making commit explicit for %s: %v", txn, t)
		}
		return pErr.GoError()
	}
	return nil
}

// setWrapped implements the txnInterceptor interface.
func (tc *txnCommitter) setWrapped(wrapped lockedSender) { tc.wrapped = wrapped }

// populateLeafInputState is part of the txnInterceptor interface.
func (*txnCommitter) populateLeafInputState(*roachpb.LeafTxnInputState) {}

// populateLeafFinalState is part of the txnInterceptor interface.
func (*txnCommitter) populateLeafFinalState(*roachpb.LeafTxnFinalState) {}

// importLeafFinalState is part of the txnInterceptor interface.
func (*txnCommitter) importLeafFinalState(context.Context, *roachpb.LeafTxnFinalState) {}

// epochBumpedLocked implements the txnReqInterceptor interface.
func (tc *txnCommitter) epochBumpedLocked() {}

// createSavepointLocked is part of the txnReqInterceptor interface.
func (*txnCommitter) createSavepointLocked(context.Context, *savepoint) {}

// rollbackToSavepointLocked is part of the txnReqInterceptor interface.
func (*txnCommitter) rollbackToSavepointLocked(context.Context, savepoint) {}

// closeLocked implements the txnReqInterceptor interface.
func (tc *txnCommitter) closeLocked() {}

func cloneWithStatus(txn *roachpb.Transaction, s roachpb.TransactionStatus) *roachpb.Transaction {
	clone := txn.Clone()
	clone.Status = s
	return clone
}