github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/bulk/sst_batcher.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package bulk

import (
	"bytes"
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/clusterversion"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

var (
	tooSmallSSTSize = settings.RegisterByteSizeSetting(
		"kv.bulk_io_write.small_write_size",
		"size below which a 'bulk' write will be performed as a normal write instead",
		400*1<<10, // 400 KiB
	)
)

type sz int64

func (b sz) String() string {
	return humanizeutil.IBytes(int64(b))
}

// SSTBatcher is a helper for bulk-adding many KVs in chunks via AddSSTable. An
// SSTBatcher can be handed KVs repeatedly and will make them into SSTs that
// are added when they reach the configured size, tracking the total added
// rows, bytes, etc. If configured with a non-nil, populated range cache, it
// will use it to attempt to flush SSTs before they cross range boundaries to
// minimize expensive on-split retries.
type SSTBatcher struct {
	db         SSTSender
	rc         *kvcoord.RangeDescriptorCache
	settings   *cluster.Settings
	maxSize    func() int64
	splitAfter func() int64

	// disallowShadowing controls whether keys may be ingested where the MVCC
	// key would shadow an existing row.
	disallowShadowing bool
	// skipDuplicates skips duplicate keys (iff they are buffered together).
	// This is true when used to backfill an inverted index: a JSONB array
	// containing multiple identical values maps them all to the same KV in
	// the inverted index, and the method that generates these KVs does not
	// dedup them, so we rely on the SSTBatcher to dedup (by skipping) rather
	// than throwing a DuplicateKeyError. This is also true when used with
	// IMPORT. IMPORT generally prohibits the ingestion of KVs that would
	// shadow existing data, with the exception of duplicates having the same
	// value and timestamp. To maintain uniform behavior, duplicates in the
	// same batch with equal values will not raise a DuplicateKeyError.
	skipDuplicates bool

	// The rest of the fields accumulate state as opposed to configuration.
	// Some, like totalRows, are accumulated _across_ batches and are not
	// reset between batches when Reset() is called.
	totalRows   roachpb.BulkOpSummary
	flushCounts struct {
		total   int
		split   int
		sstSize int
		files   int // a single flush might create multiple files.

		sendWait  time.Duration
		splitWait time.Duration
	}
	// Tracks whether we have "filled" a range, in case we want to
	// split/scatter.
	flushedToCurrentRange int64
	lastFlushKey          []byte

	// The rest of the fields are per-batch and are reset via Reset() before
	// each batch is started.
	sstWriter       storage.SSTWriter
	sstFile         *storage.MemFile
	batchStartKey   []byte
	batchEndKey     []byte
	batchEndValue   []byte
	flushKeyChecked bool
	flushKey        roachpb.Key
	// ms stores on-the-fly stats for the SST if disallowShadowing is true.
	ms enginepb.MVCCStats
	// rowCounter counts the rows written in the current batch.
	rowCounter storage.RowCounter
}

// MakeSSTBatcher makes a ready-to-use SSTBatcher.
func MakeSSTBatcher(
	ctx context.Context, db SSTSender, settings *cluster.Settings, flushBytes func() int64,
) (*SSTBatcher, error) {
	b := &SSTBatcher{db: db, settings: settings, maxSize: flushBytes, disallowShadowing: true}
	err := b.Reset(ctx)
	return b, err
}
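// The following is a minimal usage sketch for SSTBatcher, not part of the
// package: the function name, the caller-provided kvs slice, and the fixed
// 16 MiB flush size are assumptions for illustration. Keys must be added in
// sorted order; AddMVCCKey flushes automatically when the batch reaches the
// configured size or (when a range cache is configured) before crossing a
// cached range boundary.
func exampleSSTBatcherUsage(
	ctx context.Context, db SSTSender, st *cluster.Settings, kvs []storage.MVCCKeyValue,
) (roachpb.BulkOpSummary, error) {
	b, err := MakeSSTBatcher(ctx, db, st, func() int64 { return 16 << 20 })
	if err != nil {
		return roachpb.BulkOpSummary{}, err
	}
	defer b.Close()
	for _, kv := range kvs {
		if err := b.AddMVCCKey(ctx, kv.Key, kv.Value); err != nil {
			return roachpb.BulkOpSummary{}, err
		}
	}
	// Send whatever remains buffered, then report the total rows/bytes added.
	if err := b.Flush(ctx); err != nil {
		return roachpb.BulkOpSummary{}, err
	}
	return b.GetSummary(), nil
}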
// updateMVCCStats accumulates the stats contribution of a single key/value
// pair into b.ms, mirroring what a full recomputation over the SST would
// produce for unique, live keys.
func (b *SSTBatcher) updateMVCCStats(key storage.MVCCKey, value []byte) {
	metaKeySize := int64(len(key.Key)) + 1
	metaValSize := int64(0)
	b.ms.LiveBytes += metaKeySize
	b.ms.LiveCount++
	b.ms.KeyBytes += metaKeySize
	b.ms.ValBytes += metaValSize
	b.ms.KeyCount++

	totalBytes := int64(len(value)) + storage.MVCCVersionTimestampSize
	b.ms.LiveBytes += totalBytes
	b.ms.KeyBytes += storage.MVCCVersionTimestampSize
	b.ms.ValBytes += int64(len(value))
	b.ms.ValCount++
}
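// A hedged sanity-check sketch for the accounting above, not part of the
// package: the function name and sample KV are assumptions. For a 1-byte key
// and an 8-byte value, it checks that LiveBytes grows by the 2-byte key
// metadata plus the value and its fixed-size MVCC timestamp suffix.
func exampleStatsAccounting() {
	var b SSTBatcher
	key := storage.MVCCKey{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 1}}
	value := []byte("12345678")
	b.updateMVCCStats(key, value)

	metaKeySize := int64(len(key.Key)) + 1 // key bytes plus sentinel byte
	totalBytes := int64(len(value)) + storage.MVCCVersionTimestampSize
	if b.ms.LiveBytes != metaKeySize+totalBytes {
		panic("unexpected LiveBytes accounting")
	}
}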
// AddMVCCKey adds a key+timestamp/value pair to the batch (flushing if
// needed). This is only for callers that want to control the timestamp on
// individual keys, like RESTORE, where we want the restored data to look like
// the backup. Keys must be added in order.
func (b *SSTBatcher) AddMVCCKey(ctx context.Context, key storage.MVCCKey, value []byte) error {
	if len(b.batchEndKey) > 0 && bytes.Equal(b.batchEndKey, key.Key) {
		if b.skipDuplicates && bytes.Equal(b.batchEndValue, value) {
			return nil
		}

		err := &kvserverbase.DuplicateKeyError{}
		err.Key = append(err.Key, key.Key...)
		err.Value = append(err.Value, value...)
		return err
	}
	// Check if we need to flush the current batch *before* adding the next
	// k/v -- the batcher may want to flush the keys it already has, either
	// because it is full or because it wants this key in a separate batch due
	// to splits.
	if err := b.flushIfNeeded(ctx, key.Key); err != nil {
		return err
	}

	// Update the range currently represented in this batch, as necessary.
	if len(b.batchStartKey) == 0 {
		b.batchStartKey = append(b.batchStartKey[:0], key.Key...)
	}
	b.batchEndKey = append(b.batchEndKey[:0], key.Key...)
	b.batchEndValue = append(b.batchEndValue[:0], value...)

	if err := b.rowCounter.Count(key.Key); err != nil {
		return err
	}

	// If we do not allow shadowing of keys when ingesting an SST via
	// AddSSTable, then we can update the MVCCStats on the fly because we are
	// guaranteed to ingest unique keys. This saves us an extra iteration in
	// AddSSTable, which has been identified as a significant performance
	// regression for IMPORT.
	if b.disallowShadowing {
		b.updateMVCCStats(key, value)
	}

	return b.sstWriter.Put(key, value)
}

// Reset clears all state in the batcher and prepares it for reuse.
func (b *SSTBatcher) Reset(ctx context.Context) error {
	b.sstWriter.Close()
	b.sstFile = &storage.MemFile{}
	// Create "Ingestion" SSTs in the newer RocksDBv2 format only if all nodes
	// in the cluster can support it. Until then, for backward compatibility,
	// create SSTs in the leveldb format ("backup" ones).
	if b.settings.Version.IsActive(ctx, clusterversion.VersionStart20_1) {
		b.sstWriter = storage.MakeIngestionSSTWriter(b.sstFile)
	} else {
		b.sstWriter = storage.MakeBackupSSTWriter(b.sstFile)
	}
	b.batchStartKey = b.batchStartKey[:0]
	b.batchEndKey = b.batchEndKey[:0]
	b.batchEndValue = b.batchEndValue[:0]
	b.flushKey = nil
	b.flushKeyChecked = false
	b.ms.Reset()

	b.rowCounter.BulkOpSummary.Reset()
	return nil
}

const (
	manualFlush = iota
	sizeFlush
	rangeFlush
)

func (b *SSTBatcher) flushIfNeeded(ctx context.Context, nextKey roachpb.Key) error {
	// If this is the first key we have seen (since being reset), attempt to
	// find the end of the range it is in so we can flush the SST before
	// crossing it, because AddSSTable cannot span ranges and will need to be
	// split and retried from scratch if we generate an SST that ends up doing
	// so.
	if !b.flushKeyChecked && b.rc != nil {
		b.flushKeyChecked = true
		if k, err := keys.Addr(nextKey); err != nil {
			log.Warningf(ctx, "failed to get RKey for flush key lookup")
		} else {
			r := b.rc.GetCachedRangeDescriptor(k, false /* inverted */)
			if r != nil {
				b.flushKey = r.EndKey.AsRawKey()
				log.VEventf(ctx, 3, "building sstable that will flush before %v", b.flushKey)
			} else {
				log.VEventf(ctx, 3, "no cached range desc available to determine sst flush key")
			}
		}
	}

	if b.flushKey != nil && b.flushKey.Compare(nextKey) <= 0 {
		if err := b.doFlush(ctx, rangeFlush, nil); err != nil {
			return err
		}
		return b.Reset(ctx)
	}

	if b.sstWriter.DataSize >= b.maxSize() {
		if err := b.doFlush(ctx, sizeFlush, nextKey); err != nil {
			return err
		}
		return b.Reset(ctx)
	}
	return nil
}

// Flush sends the current batch, if any.
func (b *SSTBatcher) Flush(ctx context.Context) error {
	return b.doFlush(ctx, manualFlush, nil)
}
func (b *SSTBatcher) doFlush(ctx context.Context, reason int, nextKey roachpb.Key) error {
	if b.sstWriter.DataSize == 0 {
		return nil
	}
	b.flushCounts.total++

	hour := hlc.Timestamp{WallTime: timeutil.Now().Add(time.Hour).UnixNano()}

	start := roachpb.Key(append([]byte(nil), b.batchStartKey...))
	// The end key of the WriteBatch request is exclusive, but batchEndKey is
	// currently the largest key in the batch. Increment it.
	end := roachpb.Key(append([]byte(nil), b.batchEndKey...)).Next()

	size := b.sstWriter.DataSize
	if reason == sizeFlush {
		log.VEventf(ctx, 3, "flushing %s SST due to size > %s", sz(size), sz(b.maxSize()))
		b.flushCounts.sstSize++

		// On first flush, if it is due to size, we introduce one split at the
		// start of our span, since size means we didn't already hit one. When
		// adders have non-overlapping keyspace this split partitions off "our"
		// target space for future splitting/scattering, while if they don't,
		// doing this only once minimizes impact on other adders (e.g. causing
		// extra SST splitting).
		if b.flushCounts.total == 1 {
			if splitAt, err := keys.EnsureSafeSplitKey(start); err != nil {
				log.Warningf(ctx, "%v", err)
			} else {
				// NB: Passing 'hour' here is technically illegal until 19.2 is
				// active, but the value will be ignored before that, and we
				// don't have access to the cluster version here.
				if err := b.db.SplitAndScatter(ctx, splitAt, hour); err != nil {
					log.Warningf(ctx, "%v", err)
				}
			}
		}
	} else if reason == rangeFlush {
		log.VEventf(ctx, 3, "flushing %s SST due to range boundary %s", sz(size), b.flushKey)
		b.flushCounts.split++
	}

	err := b.sstWriter.Finish()
	if err != nil {
		return errors.Wrapf(err, "finishing constructed sstable")
	}

	// If the stats have been computed on-the-fly, set the last updated time
	// before ingesting the SST.
	if (b.ms != enginepb.MVCCStats{}) {
		b.ms.LastUpdateNanos = timeutil.Now().UnixNano()
	}

	beforeSend := timeutil.Now()
	files, err := AddSSTable(ctx, b.db, start, end, b.sstFile.Data(), b.disallowShadowing, b.ms, b.settings)
	if err != nil {
		return err
	}
	b.flushCounts.sendWait += timeutil.Since(beforeSend)

	b.flushCounts.files += files
	if b.flushKey != nil {
		// If the flush-before key hasn't changed, we have no reason to think
		// we crossed a range boundary, and if the files-added count is 1 we
		// didn't hit an unexpected split either, so assume we added to the
		// same range.
		if reason == sizeFlush && bytes.Equal(b.flushKey, b.lastFlushKey) && files == 1 {
			b.flushedToCurrentRange += size
		} else {
			// Assume we started adding to a new range with this SST.
			b.lastFlushKey = append(b.lastFlushKey[:0], b.flushKey...)
			b.flushedToCurrentRange = size
		}
		if b.splitAfter != nil {
			if splitAfter := b.splitAfter(); b.flushedToCurrentRange > splitAfter && nextKey != nil {
				if splitAt, err := keys.EnsureSafeSplitKey(nextKey); err != nil {
					log.Warningf(ctx, "%v", err)
				} else {
					beforeSplit := timeutil.Now()

					log.VEventf(ctx, 2, "%s added since last split, splitting/scattering for next range at %v", sz(b.flushedToCurrentRange), end)
					// NB: Passing 'hour' here is technically illegal until
					// 19.2 is active, but the value will be ignored before
					// that, and we don't have access to the cluster version
					// here.
					if err := b.db.SplitAndScatter(ctx, splitAt, hour); err != nil {
						log.Warningf(ctx, "failed to split and scatter during ingest: %+v", err)
					}
					b.flushCounts.splitWait += timeutil.Since(beforeSplit)
				}
				b.flushedToCurrentRange = 0
			}
		}
	}

	b.totalRows.Add(b.rowCounter.BulkOpSummary)
	b.totalRows.DataSize += b.sstWriter.DataSize
	return nil
}

// Close closes the underlying SST builder.
func (b *SSTBatcher) Close() {
	b.sstWriter.Close()
}

// GetSummary returns this batcher's total added rows/bytes/etc.
func (b *SSTBatcher) GetSummary() roachpb.BulkOpSummary {
	return b.totalRows
}

// SSTSender is an interface to send SST data to an engine.
type SSTSender interface {
	AddSSTable(
		ctx context.Context, begin, end interface{}, data []byte, disallowShadowing bool, stats *enginepb.MVCCStats, ingestAsWrites bool,
	) error
	SplitAndScatter(ctx context.Context, key roachpb.Key, expirationTime hlc.Timestamp) error
}
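// A hedged sketch of an SSTSender for tests, not part of the package: the
// type name and behavior are assumptions. It records the spans it is asked to
// ingest instead of sending real requests, which is enough to exercise
// SSTBatcher's flushing logic in isolation.
type recordingSSTSender struct {
	spans []roachpb.Span
}

func (s *recordingSSTSender) AddSSTable(
	ctx context.Context, begin, end interface{}, data []byte, disallowShadowing bool,
	stats *enginepb.MVCCStats, ingestAsWrites bool,
) error {
	// Callers in this file pass roachpb.Key for begin/end, so the assertions
	// below are safe for this sketch.
	s.spans = append(s.spans, roachpb.Span{Key: begin.(roachpb.Key), EndKey: end.(roachpb.Key)})
	return nil
}

func (s *recordingSSTSender) SplitAndScatter(
	ctx context.Context, key roachpb.Key, expirationTime hlc.Timestamp,
) error {
	return nil // splits are a no-op in this sketch.
}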
type sstSpan struct {
	start, end        roachpb.Key
	sstBytes          []byte
	disallowShadowing bool
	stats             enginepb.MVCCStats
}

// AddSSTable retries db.AddSSTable if retryable errors occur, including if the
// SST spans a split, in which case it is iterated and split into two SSTs, one
// for each side of the split in the error, and each is retried.
func AddSSTable(
	ctx context.Context,
	db SSTSender,
	start, end roachpb.Key,
	sstBytes []byte,
	disallowShadowing bool,
	ms enginepb.MVCCStats,
	settings *cluster.Settings,
) (int, error) {
	var files int
	now := timeutil.Now()
	iter, err := storage.NewMemSSTIterator(sstBytes, true)
	if err != nil {
		return 0, err
	}
	defer iter.Close()

	var stats enginepb.MVCCStats
	if (ms == enginepb.MVCCStats{}) {
		stats, err = storage.ComputeStatsGo(iter, start, end, now.UnixNano())
		if err != nil {
			return 0, errors.Wrapf(err, "computing stats for SST [%s, %s)", start, end)
		}
	} else {
		stats = ms
	}

	work := []*sstSpan{{start: start, end: end, sstBytes: sstBytes, disallowShadowing: disallowShadowing, stats: stats}}
	const maxAddSSTableRetries = 10
	for len(work) > 0 {
		item := work[0]
		work = work[1:]
		if err := func() error {
			var err error
			for i := 0; i < maxAddSSTableRetries; i++ {
				log.VEventf(ctx, 2, "sending %s AddSSTable [%s,%s)", sz(len(item.sstBytes)), item.start, item.end)
				before := timeutil.Now()
				// If this SST is "too small", the fixed costs associated with
				// adding an SST -- in terms of triggering flushes, extra
				// compactions, etc. -- would exceed the savings we get from
				// skipping regular, key-by-key writes, and we're better off
				// just putting its contents in a regular batch. This isn't
				// perfect: we're still incurring extra overhead constructing
				// SSTables just for use as a wire format, but the rest of the
				// bulk-ingestion implementation assumes certain semantics of
				// the AddSSTable API -- like ingestion at arbitrary timestamps
				// or collision detection -- making it simpler to always use
				// the same API and just switch how it writes its result.
				ingestAsWriteBatch := false
				if settings != nil && int64(len(item.sstBytes)) < tooSmallSSTSize.Get(&settings.SV) {
					log.VEventf(ctx, 2, "ingest data is too small (%d keys/%d bytes) for SSTable, adding via regular batch", item.stats.KeyCount, len(item.sstBytes))
					ingestAsWriteBatch = true
				}
				// This will fail if the range has split, but we'll check for
				// that below.
				err = db.AddSSTable(ctx, item.start, item.end, item.sstBytes, item.disallowShadowing, &item.stats, ingestAsWriteBatch)
				if err == nil {
					log.VEventf(ctx, 3, "adding %s AddSSTable [%s,%s) took %v", sz(len(item.sstBytes)), item.start, item.end, timeutil.Since(before))
					return nil
				}
				// This range has split -- we need to split the SST to try
				// again.
				if m := (*roachpb.RangeKeyMismatchError)(nil); errors.As(err, &m) {
					split := m.MismatchedRange.EndKey.AsRawKey()
					log.Infof(ctx, "SSTable cannot be added spanning range bounds %v, retrying...", split)
					left, right, err := createSplitSSTable(ctx, db, item.start, split, item.disallowShadowing, iter, settings)
					if err != nil {
						return err
					}

					right.stats, err = storage.ComputeStatsGo(
						iter, right.start, right.end, now.UnixNano(),
					)
					if err != nil {
						return err
					}
					left.stats = item.stats
					left.stats.Subtract(right.stats)

					// Add more work.
					work = append([]*sstSpan{left, right}, work...)
					return nil
				}
				// Retry on AmbiguousResult.
				if errors.HasType(err, (*roachpb.AmbiguousResultError)(nil)) {
					log.Warningf(ctx, "addsstable [%s,%s) attempt %d failed: %+v", start, end, i, err)
					continue
				}
			}
			return errors.Wrapf(err, "addsstable [%s,%s)", item.start, item.end)
		}(); err != nil {
			return files, err
		}
		files++
		// Explicitly deallocate the SST. This will not deallocate the top
		// level SST, which is kept around to iterate over.
		item.sstBytes = nil
	}
	log.VEventf(ctx, 3, "AddSSTable [%v, %v) added %d files and took %v", start, end, files, timeutil.Since(now))
	return files, nil
}
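// A hedged sketch, not part of the package: forcing every flush to go through
// a real AddSSTable request by zeroing the small-write threshold used above.
// The availability of Override on byte-size settings in this version is an
// assumption based on its common use in tests.
func exampleForceRealAddSSTable(st *cluster.Settings) {
	// With the threshold at 0, no SST is ever "too small", so nothing is
	// re-sent as a regular write batch.
	tooSmallSSTSize.Override(&st.SV, 0)
}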
// createSplitSSTable is a helper for splitting up SSTs. The iterator passed
// in is over the top-level SST passed into AddSSTable().
func createSplitSSTable(
	ctx context.Context,
	db SSTSender,
	start, splitKey roachpb.Key,
	disallowShadowing bool,
	iter storage.SimpleIterator,
	settings *cluster.Settings,
) (*sstSpan, *sstSpan, error) {
	sstFile := &storage.MemFile{}
	var w storage.SSTWriter
	if settings.Version.IsActive(ctx, clusterversion.VersionStart20_1) {
		w = storage.MakeIngestionSSTWriter(sstFile)
	} else {
		w = storage.MakeBackupSSTWriter(sstFile)
	}
	defer w.Close()

	split := false
	var first, last roachpb.Key
	var left, right *sstSpan

	iter.SeekGE(storage.MVCCKey{Key: start})
	for {
		if ok, err := iter.Valid(); err != nil {
			return nil, nil, err
		} else if !ok {
			break
		}

		key := iter.UnsafeKey()

		if !split && key.Key.Compare(splitKey) >= 0 {
			err := w.Finish()
			if err != nil {
				return nil, nil, err
			}
			left = &sstSpan{
				start:             first,
				end:               last.PrefixEnd(),
				sstBytes:          sstFile.Data(),
				disallowShadowing: disallowShadowing,
			}
			*sstFile = storage.MemFile{}
			if settings.Version.IsActive(ctx, clusterversion.VersionStart20_1) {
				w = storage.MakeIngestionSSTWriter(sstFile)
			} else {
				w = storage.MakeBackupSSTWriter(sstFile)
			}
			split = true
			first = nil
			last = nil
		}

		if len(first) == 0 {
			first = append(first[:0], key.Key...)
		}
		last = append(last[:0], key.Key...)

		if err := w.Put(key, iter.UnsafeValue()); err != nil {
			return nil, nil, err
		}

		iter.Next()
	}

	err := w.Finish()
	if err != nil {
		return nil, nil, err
	}
	right = &sstSpan{
		start:             first,
		end:               last.PrefixEnd(),
		sstBytes:          sstFile.Data(),
		disallowShadowing: disallowShadowing,
	}
	return left, right, nil
}
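// The following test-style sketch, not part of the package, illustrates how
// createSplitSSTable divides one SST at a boundary. The function name, the
// three sample keys, the split point "b", and the use of
// cluster.MakeTestingClusterSettings are assumptions for the example; db may
// be nil here since createSplitSSTable does not use it.
func exampleSplitSSTable(ctx context.Context) error {
	st := cluster.MakeTestingClusterSettings()
	ts := hlc.Timestamp{WallTime: timeutil.Now().UnixNano()}

	// Build a small in-memory SST containing keys a, b, and c.
	file := &storage.MemFile{}
	w := storage.MakeIngestionSSTWriter(file)
	defer w.Close()
	for _, k := range []string{"a", "b", "c"} {
		if err := w.Put(storage.MVCCKey{Key: roachpb.Key(k), Timestamp: ts}, []byte("v")); err != nil {
			return err
		}
	}
	if err := w.Finish(); err != nil {
		return err
	}

	// Iterate the finished SST and split it at "b": the left SST gets [a, b)
	// and the right SST gets the rest.
	iter, err := storage.NewMemSSTIterator(file.Data(), true)
	if err != nil {
		return err
	}
	defer iter.Close()
	left, right, err := createSplitSSTable(
		ctx, nil /* db */, roachpb.Key("a"), roachpb.Key("b"), false /* disallowShadowing */, iter, st,
	)
	if err != nil {
		return err
	}
	log.Infof(ctx, "left: [%s, %s) %d bytes; right: [%s, %s) %d bytes",
		left.start, left.end, len(left.sstBytes), right.start, right.end, len(right.sstBytes))
	return nil
}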