github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/row/kv_batch_fetcher.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package row

import (
	"bytes"
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
)

// kvBatchSize is the number of keys we request at a time.
// On a single node, 1000 was enough to avoid any performance degradation. On
// multi-node clusters, we want bigger chunks to make up for the higher latency.
// TODO(radu): parameters like this should be configurable
var kvBatchSize int64 = 10000

// TestingSetKVBatchSize changes the kvBatchFetcher batch size, and returns a
// function that restores it.
// This is to be used only in tests - we have no test coverage for arbitrary kv
// batch sizes at this time.
func TestingSetKVBatchSize(val int64) func() {
	oldVal := kvBatchSize
	kvBatchSize = val
	return func() { kvBatchSize = oldVal }
}
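// A minimal sketch of how a test might use TestingSetKVBatchSize; the test
// function below is hypothetical. The deferred call restores the original
// batch size when the test finishes:
//
//	func TestSmallBatches(t *testing.T) {
//		defer TestingSetKVBatchSize(10)()
//		// ... run a scan that now fetches at most 10 keys per batch ...
//	}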
// sendFunc is the function used to execute a KV batch; normally
// wraps (*kv.Txn).Send.
type sendFunc func(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error)

// txnKVFetcher handles retrieval of key/values.
type txnKVFetcher struct {
	// "Constant" fields, provided by the caller.
	sendFn sendFunc
	spans  roachpb.Spans
	// If useBatchLimit is true, batches are limited to kvBatchSize. If
	// firstBatchLimit is also set, the first batch is limited to that value.
	// Subsequent batches are larger, up to kvBatchSize.
	firstBatchLimit int64
	useBatchLimit   bool
	reverse         bool
	// lockStr represents the locking mode to use when fetching KVs.
	lockStr sqlbase.ScanLockingStrength
	// returnRangeInfo, if set, causes the kvBatchFetcher to populate rangeInfos.
	// See also rowFetcher.returnRangeInfo.
	returnRangeInfo bool

	fetchEnd bool
	batchIdx int

	// requestSpans contains the spans that were requested in the last request,
	// and is one to one with responses. This field is kept separately from spans
	// so that the fetcher can keep track of which response was produced for each
	// input span.
	requestSpans roachpb.Spans
	responses    []roachpb.ResponseUnion

	// As the kvBatchFetcher fetches batches of kvs, it accumulates information
	// on the replicas where the batches came from. This info can be retrieved
	// through GetRangesInfo(), to be used for updating caches.
	// rangeInfos are deduped, so they're not ordered in any particular way and
	// they don't map to kvBatchFetcher.spans in any particular way.
	rangeInfos       []roachpb.RangeInfo
	origSpan         roachpb.Span
	remainingBatches [][]byte
}

var _ kvBatchFetcher = &txnKVFetcher{}

func (f *txnKVFetcher) GetRangesInfo() []roachpb.RangeInfo {
	if !f.returnRangeInfo {
		panic(errors.AssertionFailedf("GetRangesInfo() called on kvBatchFetcher that wasn't configured with returnRangeInfo"))
	}
	return f.rangeInfos
}

// getBatchSize returns the max size of the next batch.
func (f *txnKVFetcher) getBatchSize() int64 {
	return f.getBatchSizeForIdx(f.batchIdx)
}

func (f *txnKVFetcher) getBatchSizeForIdx(batchIdx int) int64 {
	if !f.useBatchLimit {
		return 0
	}
	if f.firstBatchLimit == 0 || f.firstBatchLimit >= kvBatchSize {
		return kvBatchSize
	}

	// We grab the first batch according to the limit. If it turns out that we
	// need another batch, we grab a bigger batch. If that's still not enough,
	// we revert to the default batch size.
	switch batchIdx {
	case 0:
		return f.firstBatchLimit

	case 1:
		// Make the second batch 10 times larger (but at most the default batch
		// size and at least 1/10 of the default batch size). Sample
		// progressions of batch sizes:
		//
		//  First batch | Second batch | Subsequent batches
		//  -----------------------------------------------
		//         1    |     1,000    |     10,000
		//       100    |     1,000    |     10,000
		//       500    |     5,000    |     10,000
		//      1000    |    10,000    |     10,000
		secondBatch := f.firstBatchLimit * 10
		switch {
		case secondBatch < kvBatchSize/10:
			return kvBatchSize / 10
		case secondBatch > kvBatchSize:
			return kvBatchSize
		default:
			return secondBatch
		}

	default:
		return kvBatchSize
	}
}
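// A minimal sketch of the progression above, assuming the default kvBatchSize
// of 10,000 and a hypothetical fetcher configured with firstBatchLimit = 100:
//
//	f := txnKVFetcher{useBatchLimit: true, firstBatchLimit: 100}
//	f.getBatchSizeForIdx(0) // 100: the first batch honors firstBatchLimit
//	f.getBatchSizeForIdx(1) // 1,000: 10x the first, clamped to [kvBatchSize/10, kvBatchSize]
//	f.getBatchSizeForIdx(2) // 10,000: all subsequent batches use kvBatchSize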
// getKeyLockingStrength returns the configured per-key locking strength to use
// for key-value scans.
func (f *txnKVFetcher) getKeyLockingStrength() lock.Strength {
	switch f.lockStr {
	case sqlbase.ScanLockingStrength_FOR_NONE:
		return lock.None

	case sqlbase.ScanLockingStrength_FOR_KEY_SHARE:
		// Promote to FOR_SHARE.
		fallthrough
	case sqlbase.ScanLockingStrength_FOR_SHARE:
		// We currently perform no per-key locking when FOR_SHARE is used
		// because Shared locks have not yet been implemented.
		return lock.None

	case sqlbase.ScanLockingStrength_FOR_NO_KEY_UPDATE:
		// Promote to FOR_UPDATE.
		fallthrough
	case sqlbase.ScanLockingStrength_FOR_UPDATE:
		// We currently perform exclusive per-key locking when FOR_UPDATE is
		// used because Upgrade locks have not yet been implemented.
		return lock.Exclusive

	default:
		panic(fmt.Sprintf("unknown locking strength %s", f.lockStr))
	}
}

// makeKVBatchFetcher initializes a kvBatchFetcher for the given spans.
//
// If useBatchLimit is true, batches are limited to kvBatchSize. If
// firstBatchLimit is also set, the first batch is limited to that value.
// Subsequent batches are larger, up to kvBatchSize.
//
// Batch limits can only be used if the spans are ordered.
func makeKVBatchFetcher(
	txn *kv.Txn,
	spans roachpb.Spans,
	reverse bool,
	useBatchLimit bool,
	firstBatchLimit int64,
	lockStr sqlbase.ScanLockingStrength,
	returnRangeInfo bool,
) (txnKVFetcher, error) {
	sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
		res, err := txn.Send(ctx, ba)
		if err != nil {
			return nil, err.GoError()
		}
		return res, nil
	}
	return makeKVBatchFetcherWithSendFunc(
		sendFn, spans, reverse, useBatchLimit, firstBatchLimit, lockStr, returnRangeInfo,
	)
}

// makeKVBatchFetcherWithSendFunc is like makeKVBatchFetcher but uses a custom
// send function.
func makeKVBatchFetcherWithSendFunc(
	sendFn sendFunc,
	spans roachpb.Spans,
	reverse bool,
	useBatchLimit bool,
	firstBatchLimit int64,
	lockStr sqlbase.ScanLockingStrength,
	returnRangeInfo bool,
) (txnKVFetcher, error) {
	if firstBatchLimit < 0 || (!useBatchLimit && firstBatchLimit != 0) {
		return txnKVFetcher{}, errors.Errorf("invalid batch limit %d (useBatchLimit: %t)",
			firstBatchLimit, useBatchLimit)
	}

	if useBatchLimit {
		// Verify the spans are ordered if a batch limit is used.
		for i := 1; i < len(spans); i++ {
			if spans[i].Key.Compare(spans[i-1].EndKey) < 0 {
				return txnKVFetcher{}, errors.Errorf("unordered spans (%s %s)", spans[i-1], spans[i])
			}
		}
	} else if util.RaceEnabled {
		// Otherwise, just verify that consecutive spans don't overlap.
		for i := 1; i < len(spans); i++ {
			if spans[i].Key.Compare(spans[i-1].EndKey) >= 0 {
				// The current span's start key is greater than or equal to the last
				// span's end key - we're good.
				continue
			} else if spans[i].EndKey.Compare(spans[i-1].Key) <= 0 {
				// The current span's end key is less than or equal to the last span's
				// start key - also good.
				continue
			}
			// Otherwise, the two spans overlap, which isn't allowed - it leaves us at
			// risk of incorrect results, since the row fetcher can't distinguish
			// between identical rows in two different batches.
			return txnKVFetcher{}, errors.Errorf("overlapping neighbor spans (%s %s)", spans[i-1], spans[i])
		}
	}

	// Make a copy of the spans because we update them.
	copySpans := make(roachpb.Spans, len(spans))
	for i := range spans {
		if reverse {
			// Reverse scans receive the spans in decreasing order.
			copySpans[len(spans)-i-1] = spans[i]
		} else {
			copySpans[i] = spans[i]
		}
	}

	return txnKVFetcher{
		sendFn:          sendFn,
		spans:           copySpans,
		reverse:         reverse,
		useBatchLimit:   useBatchLimit,
		firstBatchLimit: firstBatchLimit,
		lockStr:         lockStr,
		returnRangeInfo: returnRangeInfo,
	}, nil
}
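// A minimal sketch of constructing a fetcher with a custom send function, as
// a test might do to serve canned responses; the spans variable and the empty
// response below are hypothetical:
//
//	sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
//		return &roachpb.BatchResponse{}, nil // canned response instead of (*kv.Txn).Send
//	}
//	f, err := makeKVBatchFetcherWithSendFunc(
//		sendFn, spans, false /* reverse */, false /* useBatchLimit */,
//		0 /* firstBatchLimit */, sqlbase.ScanLockingStrength_FOR_NONE,
//		false /* returnRangeInfo */,
//	)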
// fetch retrieves spans from the KV layer.
func (f *txnKVFetcher) fetch(ctx context.Context) error {
	var ba roachpb.BatchRequest
	ba.Header.MaxSpanRequestKeys = f.getBatchSize()
	if ba.Header.MaxSpanRequestKeys > 0 {
		// If this kvfetcher limits the number of rows returned, also use
		// target bytes to guard against the case in which the average row
		// is very large.
		// If no limit is set, the assumption is that SQL *knows* that there
		// is only a "small" amount of data to be read, and wants to preserve
		// concurrency for this request inside of DistSender, which setting
		// TargetBytes would interfere with.
		ba.Header.TargetBytes = 10 * (1 << 20) // 10 MiB
	}
	ba.Header.ReturnRangeInfo = f.returnRangeInfo
	ba.Requests = make([]roachpb.RequestUnion, len(f.spans))
	keyLocking := f.getKeyLockingStrength()
	if f.reverse {
		scans := make([]roachpb.ReverseScanRequest, len(f.spans))
		for i := range f.spans {
			scans[i].SetSpan(f.spans[i])
			scans[i].ScanFormat = roachpb.BATCH_RESPONSE
			scans[i].KeyLocking = keyLocking
			ba.Requests[i].MustSetInner(&scans[i])
		}
	} else {
		scans := make([]roachpb.ScanRequest, len(f.spans))
		for i := range f.spans {
			scans[i].SetSpan(f.spans[i])
			scans[i].ScanFormat = roachpb.BATCH_RESPONSE
			scans[i].KeyLocking = keyLocking
			ba.Requests[i].MustSetInner(&scans[i])
		}
	}
	if cap(f.requestSpans) < len(f.spans) {
		f.requestSpans = make(roachpb.Spans, len(f.spans))
	} else {
		f.requestSpans = f.requestSpans[:len(f.spans)]
	}
	copy(f.requestSpans, f.spans)

	if log.ExpensiveLogEnabled(ctx, 2) {
		var buf bytes.Buffer
		for i, span := range f.spans {
			if i != 0 {
				buf.WriteString(", ")
			}
			buf.WriteString(span.String())
		}
		log.VEventf(ctx, 2, "Scan %s", buf.String())
	}

	// Reset spans in preparation for adding resume-spans below.
	f.spans = f.spans[:0]

	br, err := f.sendFn(ctx, ba)
	if err != nil {
		return err
	}
	if br != nil {
		f.responses = br.Responses
	} else {
		f.responses = nil
	}

	// Set end to true until disproved.
	f.fetchEnd = true
	var sawResumeSpan bool
	for _, resp := range f.responses {
		reply := resp.GetInner()
		header := reply.Header()

		if header.NumKeys > 0 && sawResumeSpan {
			return errors.Errorf(
				"span with results after resume span; it shouldn't happen given that "+
					"we're only scanning non-overlapping spans. New spans: %s",
				sqlbase.PrettySpans(nil, f.spans, 0 /* skip */))
		}

		if resumeSpan := header.ResumeSpan; resumeSpan != nil {
			// A span needs to be resumed.
			f.fetchEnd = false
			f.spans = append(f.spans, *resumeSpan)
			// Verify we don't receive results for any remaining spans.
			sawResumeSpan = true
		}

		// Fill up the RangeInfos, in case we got any.
		if f.returnRangeInfo {
			for _, ri := range header.RangeInfos {
				f.rangeInfos = roachpb.InsertRangeInfo(f.rangeInfos, ri)
			}
		}
	}

	f.batchIdx++

	// TODO(radu): We should fetch the next chunk in the background instead of waiting for the next
	// call to fetch(). We can use a pool of workers to issue the KV ops which will also limit the
	// total number of fetches that happen in parallel (and thus the amount of resources we use).
	return nil
}
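// A sketch of how resume spans drive successive calls to fetch(), assuming a
// hypothetical span [a, e) containing keys a, b, c, d and a batch limit of 2:
//
//	fetch #1: Scan [a, e) -> keys a, b; ResumeSpan [c, e), so fetchEnd = false
//	fetch #2: Scan [c, e) -> keys c, d; no ResumeSpan, so fetchEnd = true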
// nextBatch returns the next batch of key/value pairs. If there are none
// available, a fetch is initiated. When there are no more keys, ok is false.
// origSpan returns the span that the batch was fetched from, and bounds all of
// the keys returned.
func (f *txnKVFetcher) nextBatch(
	ctx context.Context,
) (ok bool, kvs []roachpb.KeyValue, batchResponse []byte, origSpan roachpb.Span, err error) {
	if len(f.remainingBatches) > 0 {
		batch := f.remainingBatches[0]
		f.remainingBatches = f.remainingBatches[1:]
		return true, nil, batch, f.origSpan, nil
	}
	if len(f.responses) > 0 {
		reply := f.responses[0].GetInner()
		f.responses = f.responses[1:]
		origSpan := f.requestSpans[0]
		f.requestSpans = f.requestSpans[1:]
		// Remember the span so that any remaining batches of this response
		// (returned through the f.remainingBatches path above) report it too.
		f.origSpan = origSpan
		var batchResp []byte
		switch t := reply.(type) {
		case *roachpb.ScanResponse:
			if len(t.BatchResponses) > 0 {
				batchResp = t.BatchResponses[0]
				f.remainingBatches = t.BatchResponses[1:]
			}
			return true, t.Rows, batchResp, origSpan, nil
		case *roachpb.ReverseScanResponse:
			if len(t.BatchResponses) > 0 {
				batchResp = t.BatchResponses[0]
				f.remainingBatches = t.BatchResponses[1:]
			}
			return true, t.Rows, batchResp, origSpan, nil
		}
	}
	if f.fetchEnd {
		return false, nil, nil, roachpb.Span{}, nil
	}
	if err := f.fetch(ctx); err != nil {
		return false, nil, nil, roachpb.Span{}, err
	}
	return f.nextBatch(ctx)
}
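// A minimal sketch of a caller draining the fetcher, assuming it only needs
// the raw results:
//
//	for {
//		ok, kvs, batchResp, _, err := f.nextBatch(ctx)
//		if err != nil {
//			return err
//		}
//		if !ok {
//			break // all spans exhausted
//		}
//		// Process kvs ([]roachpb.KeyValue) or batchResp (batch-encoded bytes);
//		// at most one of the two is populated per call.
//	}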