github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_batch_updates.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
)

// ----------------------------------------------------------------------------
// This file contains functions that perform updates to a BatchRequest on the
// server side, specifically after the request has been routed to a replica
// (and thus the request has been split based on range boundaries). As per the
// client.Sender contract, these functions need to treat the input batches as
// copy-on-write.
// ----------------------------------------------------------------------------

// maybeStripInFlightWrites attempts to remove all point writes and query
// intents that ended up in the same batch as an EndTxn request from that EndTxn
// request's "in-flight" write set. The entire batch will commit atomically, so
// there is no need to consider the writes in the same batch concurrent.
//
// The transformation can lead to bypassing the STAGING state for a transaction
// entirely. This is possible if the function removes all of the in-flight
// writes from an EndTxn request that was committing in parallel with writes
// which all happened to be on the same range as the transaction record.
func maybeStripInFlightWrites(ba *roachpb.BatchRequest) (*roachpb.BatchRequest, error) {
	args, hasET := ba.GetArg(roachpb.EndTxn)
	if !hasET {
		return ba, nil
	}

	et := args.(*roachpb.EndTxnRequest)
	otherReqs := ba.Requests[:len(ba.Requests)-1]
	if !et.IsParallelCommit() || len(otherReqs) == 0 {
		return ba, nil
	}

	// Clone the BatchRequest and the EndTxn request before modifying them. We
	// nil out the request's in-flight writes and make the lock spans immutable
	// on append. Code below can use origET to recreate the in-flight write set
	// if any elements remain in it.
	origET := et
	et = origET.ShallowCopy().(*roachpb.EndTxnRequest)
	et.InFlightWrites = nil
	et.LockSpans = et.LockSpans[:len(et.LockSpans):len(et.LockSpans)] // immutable
	ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...)
	ba.Requests[len(ba.Requests)-1].MustSetInner(et)

	// Fast-path: If we know that this batch contains all of the transaction's
	// in-flight writes, then we can avoid searching in the in-flight writes set
	// for each request. Instead, we can blindly merge all in-flight writes into
	// the lock spans and clear out the in-flight writes set.
	if len(otherReqs) >= len(origET.InFlightWrites) {
		writes := 0
		for _, ru := range otherReqs {
			req := ru.GetInner()
			switch {
			case roachpb.IsIntentWrite(req) && !roachpb.IsRange(req):
				// Concurrent point write.
				writes++
			case req.Method() == roachpb.QueryIntent:
				// Earlier pipelined point write that hasn't been proven yet.
				writes++
			default:
				// Ranged write or read. See below.
			}
		}
		if len(origET.InFlightWrites) < writes {
			return ba, errors.New("more writes in batch with EndTxn than listed in in-flight writes")
		} else if len(origET.InFlightWrites) == writes {
			et.LockSpans = make([]roachpb.Span, len(origET.LockSpans)+len(origET.InFlightWrites))
			copy(et.LockSpans, origET.LockSpans)
			for i, w := range origET.InFlightWrites {
				et.LockSpans[len(origET.LockSpans)+i] = roachpb.Span{Key: w.Key}
			}
			// See below for why we set Header.DistinctSpans here.
			et.LockSpans, ba.Header.DistinctSpans = roachpb.MergeSpans(et.LockSpans)
			return ba, nil
		}
	}

	// Slow-path: If not, then we remove each transactional write in the batch
	// from the in-flight write set and merge it into the lock spans.
	copiedTo := 0
	for _, ru := range otherReqs {
		req := ru.GetInner()
		seq := req.Header().Sequence
		switch {
		case roachpb.IsIntentWrite(req) && !roachpb.IsRange(req):
			// Concurrent point write.
		case req.Method() == roachpb.QueryIntent:
			// Earlier pipelined point write that hasn't been proven yet. We
			// could remove from the in-flight writes set when we see these,
			// but doing so would prevent us from using the optimization we
			// have below where we rely on increasing sequence numbers for
			// each subsequent request.
			//
			// We already don't intend on sending QueryIntent requests in the
			// same batch as EndTxn requests because doing so causes a pipeline
			// stall, so this doesn't seem worthwhile to support.
			continue
		default:
			// Ranged write or read. These can make it into the final batch with
			// a parallel committing EndTxn request if the entire batch issued
			// by DistSender lands on the same range. Skip.
			continue
		}

		// Remove the write from the in-flight writes set. We only need to
		// search from after the previously removed sequence number forward
		// because both the InFlightWrites and the Requests in the batch are
		// stored in increasing sequence order.
		//
		// Maintaining an iterator into the in-flight writes slice and scanning
		// instead of performing a binary search on each request changes the
		// complexity of this loop from O(n*log(m)) to O(m), where n is the
		// number of point writes in the batch and m is the number of in-flight
		// writes. These complexities aren't directly comparable, but copying
		// all unstripped writes back into et.InFlightWrites is already O(m),
		// so the approach here was preferred over repeated binary searches.
		match := -1
		for i, w := range origET.InFlightWrites[copiedTo:] {
			if w.Sequence == seq {
				match = i + copiedTo
				break
			}
		}
		if match == -1 {
			return ba, errors.New("write in batch with EndTxn missing from in-flight writes")
		}
		w := origET.InFlightWrites[match]
		notInBa := origET.InFlightWrites[copiedTo:match]
		et.InFlightWrites = append(et.InFlightWrites, notInBa...)
		copiedTo = match + 1

		// Move the write to the lock spans set since it's no
		// longer being tracked in the in-flight write set.
		et.LockSpans = append(et.LockSpans, roachpb.Span{Key: w.Key})
	}
	if et != origET {
		// Finish building up the remaining in-flight writes.
		notInBa := origET.InFlightWrites[copiedTo:]
		et.InFlightWrites = append(et.InFlightWrites, notInBa...)
		// Re-sort and merge the lock spans. We can set the batch request's
		// DistinctSpans flag based on whether any of the in-flight writes in
		// this batch overlap with each other. This will have (rare) false
		// negatives when the in-flight writes overlap with existing lock
		// spans, but never false positives.
		et.LockSpans, ba.Header.DistinctSpans = roachpb.MergeSpans(et.LockSpans)
	}
	return ba, nil
}

// maybeBumpReadTimestampToWriteTimestamp bumps the batch's read timestamp to
// the write timestamp for transactional batches where these timestamps have
// diverged and where bumping is possible. When possible, this allows the
// transaction to commit without having to retry.
//
// Returns true if the timestamp was bumped.
//
// Note that this, like all server-side bumping of the read timestamp, only
// works for batches that exclusively contain writes; reads cannot be bumped
// like this because they've already acquired timestamp-aware latches.
func maybeBumpReadTimestampToWriteTimestamp(
	ctx context.Context, ba *roachpb.BatchRequest, latchSpans *spanset.SpanSet,
) bool {
	if ba.Txn == nil {
		return false
	}
	if ba.Txn.ReadTimestamp.Equal(ba.Txn.WriteTimestamp) {
		return false
	}
	arg, ok := ba.GetArg(roachpb.EndTxn)
	if !ok {
		return false
	}
	etArg := arg.(*roachpb.EndTxnRequest)
	if batcheval.CanForwardCommitTimestampWithoutRefresh(ba.Txn, etArg) &&
		!batcheval.IsEndTxnExceedingDeadline(ba.Txn.WriteTimestamp, etArg) {
		return tryBumpBatchTimestamp(ctx, ba, ba.Txn.WriteTimestamp, latchSpans)
	}
	return false
}

// tryBumpBatchTimestamp attempts to bump ba's read and write timestamps to ts.
//
// Returns true if the timestamp was bumped. Returns false if the timestamp
// could not be bumped.
func tryBumpBatchTimestamp(
	ctx context.Context, ba *roachpb.BatchRequest, ts hlc.Timestamp, latchSpans *spanset.SpanSet,
) bool {
	if latchSpans.MaxProtectedTimestamp().Less(ts) {
		// If the batch acquired any read latches with bounded (MVCC) timestamps
		// below this new timestamp then we cannot trivially bump the batch's
		// timestamp without dropping and re-acquiring those latches. Doing so
		// could allow the request to read at an unprotected timestamp.
		//
		// NOTE: we could consider adding a retry loop above the latch
		// acquisition to allow this to be retried, but given that we try not
		// to mix read-only and read-write requests, doing so doesn't seem
		// worth it.
		return false
	}
	if ts.Less(ba.Timestamp) {
		log.Fatalf(ctx, "trying to bump to %s <= ba.Timestamp: %s", ts, ba.Timestamp)
	}
	ba.Timestamp = ts
	if txn := ba.Txn; txn == nil {
		return true
	}
	if ts.Less(ba.Txn.ReadTimestamp) || ts.Less(ba.Txn.WriteTimestamp) {
		log.Fatalf(ctx, "trying to bump to %s inconsistent with ba.Txn.ReadTimestamp: %s, "+
			"ba.Txn.WriteTimestamp: %s", ts, ba.Txn.ReadTimestamp, ba.Txn.WriteTimestamp)
	}
	log.VEventf(ctx, 2, "bumping batch timestamp to: %s from read: %s, write: %s",
		ts, ba.Txn.ReadTimestamp, ba.Txn.WriteTimestamp)
	ba.Txn = ba.Txn.Clone()
	ba.Txn.ReadTimestamp = ts
	ba.Txn.WriteTimestamp = ts
	ba.Txn.WriteTooOld = false
	return true
}
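
// ----------------------------------------------------------------------------
// Illustrative sketch (not part of the original file): a minimal demonstration
// of two idioms relied on by maybeStripInFlightWrites above — the three-index
// ("full slice") expression that makes a slice immutable on append, and
// roachpb.MergeSpans, which merges overlapping lock spans and reports whether
// the inputs were already distinct (the value the function stores in
// Header.DistinctSpans). Only roachpb.Span, roachpb.Key, and roachpb.MergeSpans
// are used, all of which already appear in this file; the function name below
// is hypothetical.
// ----------------------------------------------------------------------------

func exampleLockSpanMerge() ([]roachpb.Span, bool) {
	// Lock spans capped at their current length: a later append is forced to
	// reallocate rather than overwrite memory shared with the original
	// EndTxnRequest (the same trick as et.LockSpans[:len:len] above).
	locks := []roachpb.Span{
		{Key: roachpb.Key("a")},
		{Key: roachpb.Key("c"), EndKey: roachpb.Key("e")},
	}
	locks = locks[:len(locks):len(locks)] // immutable on append

	// Fold a stripped in-flight point write into the lock spans, then merge.
	// The point key "d" is absorbed by the ranged span [c, e), so the merged
	// result is [{a}, [c, e)] and the distinct flag is false because an
	// overlap was found.
	locks = append(locks, roachpb.Span{Key: roachpb.Key("d")})
	return roachpb.MergeSpans(locks)
}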