github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_write.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
)

// executeWriteBatch is the entry point for client requests which may mutate the
// range's replicated state. Requests taking this path are evaluated and ultimately
// serialized through Raft, but pass through additional machinery whose goal is
// to allow commands which commute to be proposed in parallel. The naive
// alternative, submitting requests to Raft one after another, paying massive
// latency, is only taken for commands whose effects may overlap.
//
// Concretely,
//
// - The timestamp cache is checked to determine if the command's affected keys
//   were accessed with a timestamp exceeding that of the command; if so, the
//   command's timestamp is incremented accordingly.
// - A RaftCommand is constructed. If proposer-evaluated KV is active,
//   the request is evaluated and the Result is placed in the
//   RaftCommand. If not, the request itself is added to the command.
// - The proposal is inserted into the Replica's in-flight proposals map,
//   a lease index is assigned to it, and it is submitted to Raft, returning
//   a channel.
// - The result of the Raft proposal is read from the channel and the command
//   is registered with the timestamp cache, its latches are released, and
//   its result (which could be an error) is returned to the client.
//
// Returns either a response or an error, along with the provided concurrency
// guard if it is passing ownership back to the caller of the function.
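//
// A condensed caller-side sketch (illustrative only; error handling and the
// surrounding retry loop are omitted):
//
//	br, g, pErr := r.executeWriteBatch(ctx, ba, st, g)
//	if g != nil {
//		// The guard was handed back; the caller remains responsible for it.
//	}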
//
// NB: changing BatchRequest to a pointer here would have to be done cautiously
// as this method makes the assumption that it operates on a shallow copy (see
// call to applyTimestampCache).
func (r *Replica) executeWriteBatch(
	ctx context.Context, ba *roachpb.BatchRequest, st kvserverpb.LeaseStatus, g *concurrency.Guard,
) (br *roachpb.BatchResponse, _ *concurrency.Guard, pErr *roachpb.Error) {
	startTime := timeutil.Now()

	// TODO(nvanbenschoten): unlike on the read-path (executeReadOnlyBatch), we
	// don't synchronize with r.readOnlyCmdMu here. Is that ok? What if the
	// replica is destroyed concurrently with a write? We won't be able to
	// successfully propose as the lease will presumably have changed, but what
	// if we hit an error during evaluation (e.g. a ConditionFailedError)?

	// Verify that the batch can be executed.
	// NB: we only need to check that the request is in the Range's key bounds
	// at proposal time, not at application time, because the spanlatch manager
	// will synchronize all requests (notably EndTxn with SplitTrigger) that may
	// cause this condition to change.
	if err := r.checkExecutionCanProceed(ba, g, &st); err != nil {
		return nil, g, roachpb.NewError(err)
	}

	minTS, untrack := r.store.cfg.ClosedTimestamp.Tracker.Track(ctx)
	defer untrack(ctx, 0, 0, 0) // covers all error returns below

	// Examine the timestamp cache for preceding commands which require this
	// command to move its timestamp forward. Or, in the case of a transactional
	// write, the txn timestamp and possible write-too-old bool.
	if bumped := r.applyTimestampCache(ctx, ba, minTS); bumped {
		// If we bump the transaction's timestamp, we must absolutely
		// tell the client in a response transaction (for otherwise it
		// doesn't know about the incremented timestamp). Response
		// transactions are set far away from this code, but at the time
		// of writing, they always seem to be set. Since that is a
		// likely target of future micro-optimization, this assertion is
		// meant to protect against future correctness anomalies.
		defer func() {
			if br != nil && ba.Txn != nil && br.Txn == nil {
				log.Fatalf(ctx, "assertion failed: transaction updated by "+
					"timestamp cache, but no transaction returned in response; "+
					"updated timestamp would have been lost (recovered): "+
					"%s in batch %s", ba.Txn, ba,
				)
			}
		}()
	}
	log.Event(ctx, "applied timestamp cache")

	// Checking the context just before proposing can help avoid ambiguous errors.
	if err := ctx.Err(); err != nil {
		log.VEventf(ctx, 2, "%s before proposing: %s", err, ba.Summary())
		return nil, g, roachpb.NewError(errors.Wrap(err, "aborted before proposing"))
	}

	// Check that the lease is still valid before proposing to avoid discovering
	// this after replication and potentially missing out on the chance to retry
	// if the request is using AsyncConsensus. This is best-effort, but can help
	// in cases where the request waited arbitrarily long for locks acquired by
	// other transactions to be released while sequencing in the concurrency
	// manager.
	if curLease, _ := r.GetLease(); curLease.Sequence > st.Lease.Sequence {
		curLeaseCpy := curLease // avoid letting curLease escape
		err := newNotLeaseHolderError(&curLeaseCpy, r.store.StoreID(), r.Desc())
		log.VEventf(ctx, 2, "%s before proposing: %s", err, ba.Summary())
		return nil, g, roachpb.NewError(err)
	}

	// If the command is proposed to Raft, ownership of and responsibility for
	// the concurrency guard will be assumed by Raft, so provide the guard to
	// evalAndPropose.
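	//
	// evalAndPropose returns a channel on which the proposal's result will be
	// delivered, an abandon function used below to stop waiting for that result
	// if the caller gives up (the proposal may still apply, which is why those
	// paths return an AmbiguousResultError), and the lease index assigned to
	// the proposal (zero if no proposal was made or a lease was proposed).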
	ch, abandon, maxLeaseIndex, pErr := r.evalAndPropose(ctx, ba, g, &st.Lease)
	if pErr != nil {
		if maxLeaseIndex != 0 {
			log.Fatalf(
				ctx, "unexpected max lease index %d assigned to failed proposal: %s, error %s",
				maxLeaseIndex, ba, pErr,
			)
		}
		return nil, g, pErr
	}
	g = nil // ownership passed to Raft, prevent misuse

	// A max lease index of zero is returned when no proposal was made or a lease was proposed.
	// In both cases, we don't need to communicate a MLAI. Furthermore, for lease proposals we
	// cannot communicate under the lease's epoch. Instead the code calls EmitMLAI explicitly
	// as a side effect of stepping up as leaseholder.
	if maxLeaseIndex != 0 {
		untrack(ctx, ctpb.Epoch(st.Lease.Epoch), r.RangeID, ctpb.LAI(maxLeaseIndex))
	}

	// If the command was accepted by raft, wait for the range to apply it.
	ctxDone := ctx.Done()
	shouldQuiesce := r.store.stopper.ShouldQuiesce()
	startPropTime := timeutil.Now()
	slowTimer := timeutil.NewTimer()
	defer slowTimer.Stop()
	slowTimer.Reset(base.SlowRequestThreshold)
	// NOTE: this defer was moved from a case in the select statement to here
	// because escape analysis does a better job avoiding allocations to the
	// heap when defers are unconditional. When this was in the slowTimer select
	// case, it was causing pErr to escape.
	defer func() {
		if slowTimer.Read {
			r.store.metrics.SlowRaftRequests.Dec(1)
			log.Infof(
				ctx,
				"slow command %s finished after %.2fs with error %v",
				ba,
				timeutil.Since(startPropTime).Seconds(),
				pErr,
			)
		}
	}()

	for {
		select {
		case propResult := <-ch:
			// Semi-synchronously process any intents that need resolving here in
			// order to apply back pressure on the client which generated them. The
			// resolution is semi-synchronous in that there is a limited number of
			// outstanding asynchronous resolution tasks allowed after which
			// further calls will block.
			if len(propResult.EncounteredIntents) > 0 {
				// TODO(peter): Re-proposed and canceled (but executed) commands can
				// both leave intents to GC that don't hit this code path. No good
				// solution presents itself at the moment and such intents will be
				// resolved on reads.
				if err := r.store.intentResolver.CleanupIntentsAsync(
					ctx, propResult.EncounteredIntents, true, /* allowSync */
				); err != nil {
					log.Warningf(ctx, "%v", err)
				}
			}
			if len(propResult.EndTxns) > 0 {
				if err := r.store.intentResolver.CleanupTxnIntentsAsync(
					ctx, r.RangeID, propResult.EndTxns, true, /* allowSync */
				); err != nil {
					log.Warningf(ctx, "%v", err)
				}
			}
			return propResult.Reply, nil, propResult.Err
		case <-slowTimer.C:
			slowTimer.Read = true
			r.store.metrics.SlowRaftRequests.Inc(1)

			log.Errorf(ctx, "range unavailable: %v",
				rangeUnavailableMessage(r.Desc(), r.store.cfg.NodeLiveness.GetIsLiveMap(),
					r.RaftStatus(), ba, timeutil.Since(startPropTime)))
		case <-ctxDone:
			// If our context was canceled, return an AmbiguousResultError,
			// which indicates to the caller that the command may have executed.
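			// Abandoning the proposal stops us from waiting on it, but does not
			// withdraw it from Raft; it may still be applied, hence the
			// ambiguous (rather than definite) error below.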
			abandon()
			log.VEventf(ctx, 2, "context cancellation after %0.1fs of attempting command %s",
				timeutil.Since(startTime).Seconds(), ba)
			return nil, nil, roachpb.NewError(roachpb.NewAmbiguousResultError(ctx.Err().Error()))
		case <-shouldQuiesce:
			// If shutting down, return an AmbiguousResultError, which indicates
			// to the caller that the command may have executed.
			abandon()
			log.VEventf(ctx, 2, "shutdown cancellation after %0.1fs of attempting command %s",
				timeutil.Since(startTime).Seconds(), ba)
			return nil, nil, roachpb.NewError(roachpb.NewAmbiguousResultError("server shutdown"))
		}
	}
}

func rangeUnavailableMessage(
	desc *roachpb.RangeDescriptor,
	lm IsLiveMap,
	rs *raft.Status,
	ba *roachpb.BatchRequest,
	dur time.Duration,
) string {
	cpy := *desc
	desc = &cpy
	desc.StartKey, desc.EndKey = nil, nil // scrub PII

	var liveReplicas, otherReplicas []roachpb.ReplicaDescriptor
	for _, rDesc := range desc.Replicas().All() {
		if lm[rDesc.NodeID].IsLive {
			liveReplicas = append(liveReplicas, rDesc)
		} else {
			otherReplicas = append(otherReplicas, rDesc)
		}
	}
	return fmt.Sprintf(`have been waiting %.2fs for proposing command %s.
This range is likely unavailable.
Please submit this message to Cockroach Labs support along with the following information:

Descriptor: %s
Live: %s
Non-live: %s
Raft Status: %+v

and a copy of https://yourhost:8080/#/reports/range/%d

If you are using CockroachDB Enterprise, reach out through your
support contract. Otherwise, please open an issue at:

  https://github.com/cockroachdb/cockroach/issues/new/choose
`,
		dur.Seconds(),
		ba,
		desc,
		roachpb.MakeReplicaDescriptors(liveReplicas),
		roachpb.MakeReplicaDescriptors(otherReplicas),
		rs,
		desc.RangeID,
	)
}

// canAttempt1PCEvaluation looks at the batch and decides whether it can be
// executed as 1PC.
func (r *Replica) canAttempt1PCEvaluation(
	ctx context.Context, ba *roachpb.BatchRequest, latchSpans *spanset.SpanSet,
) (bool, *roachpb.Error) {
	if !isOnePhaseCommit(ba) {
		return false, nil
	}

	if ba.Timestamp != ba.Txn.WriteTimestamp {
		log.Fatalf(ctx, "unexpected 1PC execution with diverged timestamp. %s != %s",
			ba.Timestamp, ba.Txn.WriteTimestamp)
	}

	// The EndTxn checks whether the txn record can be created, but we're
	// eliding the EndTxn. So, we'll do the check instead.
	ok, minCommitTS, reason := r.CanCreateTxnRecord(ba.Txn.ID, ba.Txn.Key, ba.Txn.MinTimestamp)
	if !ok {
		newTxn := ba.Txn.Clone()
		newTxn.Status = roachpb.ABORTED
		return false, roachpb.NewErrorWithTxn(roachpb.NewTransactionAbortedError(reason), newTxn)
	}
	if ba.Timestamp.Less(minCommitTS) {
		ba.Txn.WriteTimestamp = minCommitTS
		// We can only evaluate at the new timestamp if we manage to bump the read
		// timestamp.
		return maybeBumpReadTimestampToWriteTimestamp(ctx, ba, latchSpans), nil
	}
	return true, nil
}

// evaluateWriteBatch evaluates the supplied batch.
//
// If the batch is transactional and has all the hallmarks of a 1PC commit (i.e.
// includes all intent writes & EndTxn, and there's nothing to suggest that the
// transaction will require retry or restart), the batch's txn is stripped and
// it's executed as an atomic batch write. If the writes cannot all be completed
// at the intended timestamp, the batch's txn is restored and it's re-executed
// in full. This allows it to lay down intents and return an appropriate
// retryable error.
func (r *Replica) evaluateWriteBatch(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	ba *roachpb.BatchRequest,
	latchSpans *spanset.SpanSet,
) (storage.Batch, enginepb.MVCCStats, *roachpb.BatchResponse, result.Result, *roachpb.Error) {
	log.Event(ctx, "executing read-write batch")

	// If the transaction has been pushed but it can commit at the higher
	// timestamp, let's evaluate the batch at the bumped timestamp. This will
	// allow it to commit, and also it'll allow us to attempt the 1PC code path.
	maybeBumpReadTimestampToWriteTimestamp(ctx, ba, latchSpans)

	// Attempt 1PC execution, if applicable. If not transactional or there are
	// indications that the batch's txn will require retry, execute as normal.
	ok, pErr := r.canAttempt1PCEvaluation(ctx, ba, latchSpans)
	if pErr != nil {
		return nil, enginepb.MVCCStats{}, nil, result.Result{}, pErr
	}
	if ok {
		res := r.evaluate1PC(ctx, idKey, ba, latchSpans)
		switch res.success {
		case onePCSucceeded:
			return res.batch, res.stats, res.br, res.res, nil
		case onePCFailed:
			if res.pErr == nil {
				log.Fatalf(ctx, "1PC failed but no err. ba: %s", ba.String())
			}
			return nil, enginepb.MVCCStats{}, nil, result.Result{}, res.pErr
		case onePCFallbackToTransactionalEvaluation:
		}
	}

	ms := new(enginepb.MVCCStats)
	rec := NewReplicaEvalContext(r, latchSpans)
	batch, br, res, pErr := r.evaluateWriteBatchWithServersideRefreshes(
		ctx, idKey, rec, ms, ba, latchSpans, nil /* deadline */)
	return batch, *ms, br, res, pErr
}

type onePCSuccess int

const (
	// onePCSucceeded means that the 1PC evaluation succeeded and the results should be
	// returned to the client.
	onePCSucceeded onePCSuccess = iota
	// onePCFailed means that the 1PC evaluation failed and the attached error should be
	// returned to the client.
	onePCFailed
	// onePCFallbackToTransactionalEvaluation means that 1PC evaluation failed, but
	// regular transactional evaluation should be attempted.
	onePCFallbackToTransactionalEvaluation
)

type onePCResult struct {
	success onePCSuccess
	// pErr is set if success == onePCFailed. This is the error that should be
	// returned to the client for this request.
	pErr *roachpb.Error

	// The fields below are only set when success == onePCSucceeded.
	stats enginepb.MVCCStats
	br    *roachpb.BatchResponse
	res   result.Result
	batch storage.Batch
}

// evaluate1PC attempts to evaluate the batch as a 1PC transaction - meaning it
// attempts to evaluate the batch as a non-transactional request. This is only
// possible if the batch contains all of the transaction's writes, which the
// caller needs to ensure. If successful, evaluating the batch this way is more
// efficient - we're avoiding writing the transaction record and writing and
// then immediately deleting intents.
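//
// Illustratively, for a batch of the form [Put, Put, EndTxn(commit=true)] that
// contains all of the transaction's writes, the EndTxn is stripped and the two
// Puts are evaluated as a non-transactional batch; on success an EndTxnResponse
// is synthesized and the returned transaction is marked COMMITTED (or ABORTED
// if the EndTxn did not ask to commit).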
func (r *Replica) evaluate1PC(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	ba *roachpb.BatchRequest,
	latchSpans *spanset.SpanSet,
) (onePCRes onePCResult) {
	log.VEventf(ctx, 2, "attempting 1PC execution")

	var batch storage.Batch
	defer func() {
		// Close the batch unless it's passed to the caller (when the evaluation
		// succeeds).
		if onePCRes.success != onePCSucceeded {
			batch.Close()
		}
	}()

	// Try executing with transaction stripped.
	strippedBa := *ba
	strippedBa.Txn = nil
	strippedBa.Requests = ba.Requests[:len(ba.Requests)-1] // strip end txn req

	rec := NewReplicaEvalContext(r, latchSpans)
	var br *roachpb.BatchResponse
	var res result.Result
	var pErr *roachpb.Error

	arg, _ := ba.GetArg(roachpb.EndTxn)
	etArg := arg.(*roachpb.EndTxnRequest)
	canFwdTimestamp := batcheval.CanForwardCommitTimestampWithoutRefresh(ba.Txn, etArg)

	// Evaluate strippedBa. If the transaction allows, permit refreshes.
	ms := new(enginepb.MVCCStats)
	if canFwdTimestamp {
		batch, br, res, pErr = r.evaluateWriteBatchWithServersideRefreshes(
			ctx, idKey, rec, ms, &strippedBa, latchSpans, etArg.Deadline)
	} else {
		batch, br, res, pErr = r.evaluateWriteBatchWrapper(
			ctx, idKey, rec, ms, &strippedBa, latchSpans)
	}

	if pErr != nil || (!canFwdTimestamp && ba.Timestamp != br.Timestamp) {
		if pErr != nil {
			log.VEventf(ctx, 2,
				"1PC execution failed, falling back to transactional execution. pErr: %v", pErr.String())
		} else {
			log.VEventf(ctx, 2,
				"1PC execution failed, falling back to transactional execution; the batch was pushed")
		}
		return onePCResult{success: onePCFallbackToTransactionalEvaluation}
	}

	// 1PC execution was successful, let's synthesize an EndTxnResponse.

	clonedTxn := ba.Txn.Clone()
	clonedTxn.Status = roachpb.COMMITTED
	// Make sure the returned txn has the actual commit timestamp. This can be
	// different from ba.Txn's if the stripped batch was evaluated at a bumped
	// timestamp.
	clonedTxn.ReadTimestamp = br.Timestamp
	clonedTxn.WriteTimestamp = br.Timestamp

	// If the end transaction is not committed, clear the batch and mark the status aborted.
	if !etArg.Commit {
		clonedTxn.Status = roachpb.ABORTED
		batch.Close()
		batch = r.store.Engine().NewBatch()
		ms = new(enginepb.MVCCStats)
	} else {
		// Run commit trigger manually.
		innerResult, err := batcheval.RunCommitTrigger(ctx, rec, batch, ms, etArg, clonedTxn)
		if err != nil {
			return onePCResult{
				success: onePCFailed,
				pErr:    roachpb.NewErrorf("failed to run commit trigger: %s", err),
			}
		}
		if err := res.MergeAndDestroy(innerResult); err != nil {
			return onePCResult{
				success: onePCFailed,
				pErr:    roachpb.NewError(err),
			}
		}
	}

	// Even though the transaction is 1PC and hasn't written any intents, it may
	// have acquired unreplicated locks, so inform the concurrency manager that
	// it is finalized and that any unreplicated locks that it has acquired can
	// be released.
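	// Each LockUpdate below covers one of the EndTxn's declared lock spans and
	// carries the transaction's final status, allowing any locks in those spans
	// to be cleaned up.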
	res.Local.UpdatedTxns = []*roachpb.Transaction{clonedTxn}
	res.Local.ResolvedLocks = make([]roachpb.LockUpdate, len(etArg.LockSpans))
	for i, sp := range etArg.LockSpans {
		res.Local.ResolvedLocks[i] = roachpb.LockUpdate{
			Span:           sp,
			Txn:            clonedTxn.TxnMeta,
			Status:         clonedTxn.Status,
			IgnoredSeqNums: clonedTxn.IgnoredSeqNums,
		}
	}

	// Add placeholder responses for end transaction requests.
	br.Add(&roachpb.EndTxnResponse{OnePhaseCommit: true})
	br.Txn = clonedTxn
	return onePCResult{
		success: onePCSucceeded,
		stats:   *ms,
		br:      br,
		res:     res,
		batch:   batch,
	}
}

// evaluateWriteBatchWithServersideRefreshes invokes evaluateBatch and retries
// at a higher timestamp in the event of some retriable errors if allowed by the
// batch/txn.
//
// deadline, if not nil, specifies the highest timestamp (exclusive) at which
// the request can be evaluated. If ba is a transactional request, then deadline
// cannot be specified; a transaction's deadline comes from its EndTxn request.
func (r *Replica) evaluateWriteBatchWithServersideRefreshes(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	rec batcheval.EvalContext,
	ms *enginepb.MVCCStats,
	ba *roachpb.BatchRequest,
	latchSpans *spanset.SpanSet,
	deadline *hlc.Timestamp,
) (batch storage.Batch, br *roachpb.BatchResponse, res result.Result, pErr *roachpb.Error) {
	goldenMS := *ms
	for retries := 0; ; retries++ {
		if retries > 0 {
			log.VEventf(ctx, 2, "server-side retry of batch")
		}
		if batch != nil {
			// Reset the stats.
			*ms = goldenMS
			batch.Close()
		}

		batch, br, res, pErr = r.evaluateWriteBatchWrapper(ctx, idKey, rec, ms, ba, latchSpans)

		var success bool
		if pErr == nil {
			wto := br.Txn != nil && br.Txn.WriteTooOld
			success = !wto
		} else {
			success = false
		}

		// If we can retry, set a higher batch timestamp and continue.
		// Allow one retry only; a non-txn batch containing overlapping
		// spans will always experience WriteTooOldError.
		if success || retries > 0 || !canDoServersideRetry(ctx, pErr, ba, br, latchSpans, deadline) {
			break
		}
	}
	return batch, br, res, pErr
}

// evaluateWriteBatchWrapper is a wrapper on top of evaluateBatch() which deals
// with filling out result.LogicalOpLog.
func (r *Replica) evaluateWriteBatchWrapper(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	rec batcheval.EvalContext,
	ms *enginepb.MVCCStats,
	ba *roachpb.BatchRequest,
	latchSpans *spanset.SpanSet,
) (storage.Batch, *roachpb.BatchResponse, result.Result, *roachpb.Error) {
	batch, opLogger := r.newBatchedEngine(latchSpans)
	br, res, pErr := evaluateBatch(ctx, idKey, batch, rec, ms, ba, false /* readOnly */)
	if pErr == nil {
		if opLogger != nil {
			res.LogicalOpLog = &kvserverpb.LogicalOpLog{
				Ops: opLogger.LogicalOps(),
			}
		}
	}
	return batch, br, res, pErr
}

// newBatchedEngine creates an engine.Batch. Depending on whether rangefeeds
// are enabled, it also returns an engine.OpLoggerBatch. If non-nil, then this
// OpLogger is attached to the returned engine.Batch, recording all operations.
// Its recording should be attached to the Result of request evaluation.
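//
// For example, when rangefeeds are enabled for this range, the returned
// OpLoggerBatch records the logical MVCC operations performed by the request so
// that they can later be attached to the evaluation's Result via
// result.LogicalOpLog (see evaluateWriteBatchWrapper) and fed to rangefeeds.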
func (r *Replica) newBatchedEngine(spans *spanset.SpanSet) (storage.Batch, *storage.OpLoggerBatch) {
	batch := r.store.Engine().NewBatch()
	var opLogger *storage.OpLoggerBatch
	if r.isSystemRange() || RangefeedEnabled.Get(&r.store.cfg.Settings.SV) {
		// TODO(nvanbenschoten): once we get rid of the RangefeedEnabled
		// cluster setting we'll need a way to turn this on when any
		// replica (not just the leaseholder) wants it and off when no
		// replicas want it. This turns out to be pretty involved.
		//
		// The current plan is to:
		// - create a range-id local key that stores all replicas that are
		//   subscribed to logical operations, along with their corresponding
		//   liveness epoch.
		// - create a new command that adds or subtracts replicas from this
		//   structure. The command will be a write across the entire replica
		//   span so that it is serialized with all writes.
		// - each replica will add itself to this set when it first needs
		//   logical ops. It will then wait until it sees the replicated command
		//   that added itself pop out through Raft so that it knows all
		//   commands that are missing logical ops are gone.
		// - It will then proceed as normal, relying on the logical ops to
		//   always be included on the raft commands. When it no longer
		//   needs logical ops, it will remove itself from the set.
		// - The leaseholder will have a new queue to detect registered
		//   replicas that are no longer live and remove them from the
		//   set to prevent "leaking" subscriptions.
		// - The condition here to add logical logging will be:
		//   if len(replicaState.logicalOpsSubs) > 0 { ... }
		//
		// An alternative to this is to reduce the cost of including the
		// logical op log to a negligible amount such that it can be
		// included on all raft commands, regardless of whether any replica
		// has a rangefeed running or not.
		//
		// Another alternative is to make the setting table/zone-scoped
		// instead of a fine-grained per-replica state.
		opLogger = storage.NewOpLoggerBatch(batch)
		batch = opLogger
	}
	if util.RaceEnabled {
		// During writes we may encounter a versioned value newer than the request
		// timestamp, and may have to retry at a higher timestamp. This is still
		// safe as we're only ever writing at timestamps higher than the timestamp
		// any write latch would be declared at. But because of this, we don't
		// assert on access timestamps using spanset.NewBatchAt.
		batch = spanset.NewBatch(batch, spans)
	}
	return batch, opLogger
}

// isOnePhaseCommit returns true iff the BatchRequest contains all writes in the
// transaction and ends with an EndTxn. One phase commits are disallowed if any
// of the following conditions are true:
// (1) the transaction has already been flagged with a write too old error
// (2) the transaction's commit timestamp has been forwarded
// (3) the transaction exceeded its deadline
// (4) the transaction is not in its first epoch and the EndTxn request does
//     not require one phase commit.
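//
// For example, a complete batch [Put("a"), Put("b"), EndTxn(commit=true)] from
// a transaction in its first epoch qualifies, while the same batch issued after
// a transaction restart (epoch > 0) does not, unless the EndTxn sets
// Require1PC.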
func isOnePhaseCommit(ba *roachpb.BatchRequest) bool {
	if ba.Txn == nil {
		return false
	}
	if !ba.IsCompleteTransaction() {
		return false
	}
	arg, _ := ba.GetArg(roachpb.EndTxn)
	etArg := arg.(*roachpb.EndTxnRequest)
	if retry, _, _ := batcheval.IsEndTxnTriggeringRetryError(ba.Txn, etArg); retry {
		return false
	}
	// If the transaction has already restarted at least once then it may have
	// left intents at prior epochs that need to be cleaned up during the
	// process of committing the transaction. Even if the current epoch could
	// perform a one phase commit, we don't allow it to because that could
	// prevent it from properly resolving intents from prior epochs and cause
	// it to abandon them instead.
	//
	// The exception to this rule is transactions that require a one phase
	// commit. We know that if they also required a one phase commit in past
	// epochs then they couldn't have left any intents that they now need to
	// clean up.
	return ba.Txn.Epoch == 0 || etArg.Require1PC
}