github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/suffix_rewriter.go (about) 1 package sstable 2 3 import ( 4 "bytes" 5 "context" 6 "math" 7 "sync" 8 9 "github.com/cespare/xxhash/v2" 10 "github.com/cockroachdb/errors" 11 "github.com/cockroachdb/pebble/internal/base" 12 "github.com/cockroachdb/pebble/internal/bytealloc" 13 "github.com/cockroachdb/pebble/internal/invariants" 14 "github.com/cockroachdb/pebble/internal/rangekey" 15 "github.com/cockroachdb/pebble/objstorage" 16 ) 17 18 // RewriteKeySuffixes is deprecated. 19 // 20 // TODO(sumeer): remove after switching CockroachDB to RewriteKeySuffixesAndReturnFormat. 21 func RewriteKeySuffixes( 22 sst []byte, 23 rOpts ReaderOptions, 24 out objstorage.Writable, 25 o WriterOptions, 26 from, to []byte, 27 concurrency int, 28 ) (*WriterMetadata, error) { 29 meta, _, err := RewriteKeySuffixesAndReturnFormat(sst, rOpts, out, o, from, to, concurrency) 30 return meta, err 31 } 32 33 // RewriteKeySuffixesAndReturnFormat copies the content of the passed SSTable 34 // bytes to a new sstable, written to `out`, in which the suffix `from` has is 35 // replaced with `to` in every key. The input sstable must consist of only 36 // Sets or RangeKeySets and every key must have `from` as its suffix as 37 // determined by the Split function of the Comparer in the passed 38 // WriterOptions. Range deletes must not exist in this sstable, as they will 39 // be ignored. 40 // 41 // Data blocks are rewritten in parallel by `concurrency` workers and then 42 // assembled into a final SST. Filters are copied from the original SST without 43 // modification as they are not affected by the suffix, while block and table 44 // properties are only minimally recomputed. 45 // 46 // TODO(sumeer): document limitations, if any, due to this limited 47 // re-computation of properties (is there any loss of fidelity?). 
//
// Any block and table property collectors configured in the WriterOptions must
// implement SuffixReplaceableTableCollector/SuffixReplaceableBlockCollector.
//
// The WriterOptions.TableFormat is ignored, and the output sstable has the
// same TableFormat as the input, which is returned in case the caller wants
// to do some error checking. Suffix rewriting is meant to be efficient, and
// allowing changes in the TableFormat detracts from that efficiency.
//
// Any obsolete bits that key-value pairs may be annotated with are ignored
// and lost during the rewrite. Additionally, the output sstable has the
// pebble.obsolete.is_strict property set to false. These limitations could be
// removed if needed. The current use case for
// RewriteKeySuffixesAndReturnFormat in CockroachDB is for MVCC-compliant file
// ingestion, where these files do not contain RANGEDELs and have one
// key-value pair per userkey -- so they trivially satisfy the strict
// criteria, and we don't need the obsolete bit as a performance optimization.
// For disaggregated storage, strict obsolete sstables are needed for L5 and
// L6, but at the time of writing, we expect such MVCC-compliant file
// ingestion to only ingest into levels L4 and higher. If this changes, we can
// do one of two things to get rid of this limitation:
//   - Validate that there are no duplicate userkeys and no RANGEDELs/MERGEs
//     in the sstable to be rewritten. Validating no duplicate userkeys is
//     non-trivial when rewriting blocks in parallel, so we could encode the
//     pre-existing condition in the (existing) SnapshotPinnedKeys property --
//     we need to update the external sst writer to calculate and encode this
//     property.
//   - Preserve the obsolete bit (with changes to the blockIter).
76 func RewriteKeySuffixesAndReturnFormat( 77 sst []byte, 78 rOpts ReaderOptions, 79 out objstorage.Writable, 80 o WriterOptions, 81 from, to []byte, 82 concurrency int, 83 ) (*WriterMetadata, TableFormat, error) { 84 r, err := NewMemReader(sst, rOpts) 85 if err != nil { 86 return nil, TableFormatUnspecified, err 87 } 88 defer r.Close() 89 return rewriteKeySuffixesInBlocks(r, out, o, from, to, concurrency) 90 } 91 92 func rewriteKeySuffixesInBlocks( 93 r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte, concurrency int, 94 ) (*WriterMetadata, TableFormat, error) { 95 if o.Comparer == nil || o.Comparer.Split == nil { 96 return nil, TableFormatUnspecified, 97 errors.New("a valid splitter is required to rewrite suffixes") 98 } 99 if concurrency < 1 { 100 return nil, TableFormatUnspecified, errors.New("concurrency must be >= 1") 101 } 102 // Even though NumValueBlocks = 0 => NumValuesInValueBlocks = 0, check both 103 // as a defensive measure. 104 if r.Properties.NumValueBlocks > 0 || r.Properties.NumValuesInValueBlocks > 0 { 105 return nil, TableFormatUnspecified, 106 errors.New("sstable with a single suffix should not have value blocks") 107 } 108 109 tableFormat := r.tableFormat 110 o.TableFormat = tableFormat 111 w := NewWriter(out, o) 112 defer func() { 113 if w != nil { 114 w.Close() 115 } 116 }() 117 118 for _, c := range w.propCollectors { 119 if _, ok := c.(SuffixReplaceableTableCollector); !ok { 120 return nil, TableFormatUnspecified, 121 errors.Errorf("property collector %s does not support suffix replacement", c.Name()) 122 } 123 } 124 for _, c := range w.blockPropCollectors { 125 if _, ok := c.(SuffixReplaceableBlockCollector); !ok { 126 return nil, TableFormatUnspecified, 127 errors.Errorf("block property collector %s does not support suffix replacement", c.Name()) 128 } 129 } 130 131 l, err := r.Layout() 132 if err != nil { 133 return nil, TableFormatUnspecified, errors.Wrap(err, "reading layout") 134 } 135 136 if err := 
rewriteDataBlocksToWriter(r, w, l.Data, from, to, w.split, concurrency); err != nil { 137 return nil, TableFormatUnspecified, errors.Wrap(err, "rewriting data blocks") 138 } 139 140 // Copy over the range key block and replace suffixes in it if it exists. 141 if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil { 142 return nil, TableFormatUnspecified, errors.Wrap(err, "rewriting range key blocks") 143 } 144 145 // Copy over the filter block if it exists (rewriteDataBlocksToWriter will 146 // already have ensured this is valid if it exists). 147 if w.filter != nil && l.Filter.Length > 0 { 148 filterBlock, _, err := readBlockBuf(r, l.Filter, nil) 149 if err != nil { 150 return nil, TableFormatUnspecified, errors.Wrap(err, "reading filter") 151 } 152 w.filter = copyFilterWriter{ 153 origPolicyName: w.filter.policyName(), origMetaName: w.filter.metaName(), data: filterBlock, 154 } 155 } 156 157 if err := w.Close(); err != nil { 158 w = nil 159 return nil, TableFormatUnspecified, err 160 } 161 writerMeta, err := w.Metadata() 162 w = nil 163 return writerMeta, tableFormat, err 164 } 165 166 var errBadKind = errors.New("key does not have expected kind (set)") 167 168 type blockWithSpan struct { 169 start, end InternalKey 170 data []byte 171 } 172 173 func rewriteBlocks( 174 r *Reader, 175 restartInterval int, 176 checksumType ChecksumType, 177 compression Compression, 178 input []BlockHandleWithProperties, 179 output []blockWithSpan, 180 totalWorkers, worker int, 181 from, to []byte, 182 split Split, 183 ) error { 184 bw := blockWriter{ 185 restartInterval: restartInterval, 186 } 187 buf := blockBuf{checksummer: checksummer{checksumType: checksumType}} 188 if checksumType == ChecksumTypeXXHash { 189 buf.checksummer.xxHasher = xxhash.New() 190 } 191 192 var blockAlloc bytealloc.A 193 var keyAlloc bytealloc.A 194 var scratch InternalKey 195 196 var inputBlock, inputBlockBuf []byte 197 198 iter := &blockIter{} 199 200 // We'll assume all blocks are _roughly_ 
equal so round-robin static partition 201 // of each worker doing every ith block is probably enough. 202 for i := worker; i < len(input); i += totalWorkers { 203 bh := input[i] 204 205 var err error 206 inputBlock, inputBlockBuf, err = readBlockBuf(r, bh.BlockHandle, inputBlockBuf) 207 if err != nil { 208 return err 209 } 210 if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum, false); err != nil { 211 return err 212 } 213 214 if cap(bw.restarts) < int(iter.restarts) { 215 bw.restarts = make([]uint32, 0, iter.restarts) 216 } 217 if cap(bw.buf) == 0 { 218 bw.buf = make([]byte, 0, len(inputBlock)) 219 } 220 if cap(bw.restarts) < int(iter.numRestarts) { 221 bw.restarts = make([]uint32, 0, iter.numRestarts) 222 } 223 224 for key, val := iter.First(); key != nil; key, val = iter.Next() { 225 if key.Kind() != InternalKeyKindSet { 226 return errBadKind 227 } 228 si := split(key.UserKey) 229 oldSuffix := key.UserKey[si:] 230 if !bytes.Equal(oldSuffix, from) { 231 err := errors.Errorf("key has suffix %q, expected %q", oldSuffix, from) 232 return err 233 } 234 newLen := si + len(to) 235 if cap(scratch.UserKey) < newLen { 236 scratch.UserKey = make([]byte, 0, len(key.UserKey)*2+len(to)-len(from)) 237 } 238 239 scratch.Trailer = key.Trailer 240 scratch.UserKey = scratch.UserKey[:newLen] 241 copy(scratch.UserKey, key.UserKey[:si]) 242 copy(scratch.UserKey[si:], to) 243 244 // NB: for TableFormatPebblev3 and higher, since 245 // !iter.lazyValueHandling.hasValuePrefix, it will return the raw value 246 // in the block, which includes the 1-byte prefix. This is fine since bw 247 // also does not know about the prefix and will preserve it in bw.add. 
248 v := val.InPlaceValue() 249 if invariants.Enabled && r.tableFormat >= TableFormatPebblev3 && 250 key.Kind() == InternalKeyKindSet { 251 if len(v) < 1 { 252 return errors.Errorf("value has no prefix") 253 } 254 prefix := valuePrefix(v[0]) 255 if isValueHandle(prefix) { 256 return errors.Errorf("value prefix is incorrect") 257 } 258 if setHasSamePrefix(prefix) { 259 return errors.Errorf("multiple keys with same key prefix") 260 } 261 } 262 bw.add(scratch, v) 263 if output[i].start.UserKey == nil { 264 keyAlloc, output[i].start = cloneKeyWithBuf(scratch, keyAlloc) 265 } 266 } 267 *iter = iter.resetForReuse() 268 269 keyAlloc, output[i].end = cloneKeyWithBuf(scratch, keyAlloc) 270 271 finished := compressAndChecksum(bw.finish(), compression, &buf) 272 273 // copy our finished block into the output buffer. 274 blockAlloc, output[i].data = blockAlloc.Alloc(len(finished) + blockTrailerLen) 275 copy(output[i].data, finished) 276 copy(output[i].data[len(finished):], buf.tmp[:blockTrailerLen]) 277 } 278 return nil 279 } 280 281 func rewriteDataBlocksToWriter( 282 r *Reader, 283 w *Writer, 284 data []BlockHandleWithProperties, 285 from, to []byte, 286 split Split, 287 concurrency int, 288 ) error { 289 if r.Properties.NumEntries == 0 { 290 // No point keys. 
291 return nil 292 } 293 blocks := make([]blockWithSpan, len(data)) 294 295 if w.filter != nil { 296 if r.Properties.FilterPolicyName != w.filter.policyName() { 297 return errors.New("mismatched filters") 298 } 299 if was, is := r.Properties.ComparerName, w.props.ComparerName; was != is { 300 return errors.Errorf("mismatched Comparer %s vs %s, replacement requires same splitter to copy filters", was, is) 301 } 302 } 303 304 g := &sync.WaitGroup{} 305 g.Add(concurrency) 306 errCh := make(chan error, concurrency) 307 for i := 0; i < concurrency; i++ { 308 worker := i 309 go func() { 310 defer g.Done() 311 err := rewriteBlocks( 312 r, 313 w.dataBlockBuf.dataBlock.restartInterval, 314 w.blockBuf.checksummer.checksumType, 315 w.compression, 316 data, 317 blocks, 318 concurrency, 319 worker, 320 from, to, 321 split, 322 ) 323 if err != nil { 324 errCh <- err 325 } 326 }() 327 } 328 g.Wait() 329 close(errCh) 330 if err, ok := <-errCh; ok { 331 return err 332 } 333 334 for _, p := range w.propCollectors { 335 if err := p.(SuffixReplaceableTableCollector).UpdateKeySuffixes(r.Properties.UserProperties, from, to); err != nil { 336 return err 337 } 338 } 339 340 var decoder blockPropertiesDecoder 341 var oldShortIDs []shortID 342 var oldProps [][]byte 343 if len(w.blockPropCollectors) > 0 { 344 oldProps = make([][]byte, len(w.blockPropCollectors)) 345 oldShortIDs = make([]shortID, math.MaxUint8) 346 for i, p := range w.blockPropCollectors { 347 if prop, ok := r.Properties.UserProperties[p.Name()]; ok { 348 was, is := shortID(byte(prop[0])), shortID(i) 349 oldShortIDs[was] = is 350 } 351 } 352 } 353 354 for i := range blocks { 355 // Write the rewritten block to the file. 356 if err := w.writable.Write(blocks[i].data); err != nil { 357 return err 358 } 359 360 n := len(blocks[i].data) 361 bh := BlockHandle{Offset: w.meta.Size, Length: uint64(n) - blockTrailerLen} 362 // Update the overall size. 
363 w.meta.Size += uint64(n) 364 365 // Load any previous values for our prop collectors into oldProps. 366 for i := range oldProps { 367 oldProps[i] = nil 368 } 369 decoder.props = data[i].Props 370 for !decoder.done() { 371 id, val, err := decoder.next() 372 if err != nil { 373 return err 374 } 375 oldProps[oldShortIDs[id]] = val 376 } 377 378 for i, p := range w.blockPropCollectors { 379 if err := p.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProps[i], from, to); err != nil { 380 return err 381 } 382 } 383 384 bhp, err := w.maybeAddBlockPropertiesToBlockHandle(bh) 385 if err != nil { 386 return err 387 } 388 var nextKey InternalKey 389 if i+1 < len(blocks) { 390 nextKey = blocks[i+1].start 391 } 392 if err = w.addIndexEntrySync(blocks[i].end, nextKey, bhp, w.dataBlockBuf.tmp[:]); err != nil { 393 return err 394 } 395 } 396 397 w.meta.updateSeqNum(blocks[0].start.SeqNum()) 398 w.props.NumEntries = r.Properties.NumEntries 399 w.props.RawKeySize = r.Properties.RawKeySize 400 w.props.RawValueSize = r.Properties.RawValueSize 401 w.meta.SetSmallestPointKey(blocks[0].start) 402 w.meta.SetLargestPointKey(blocks[len(blocks)-1].end) 403 return nil 404 } 405 406 func rewriteRangeKeyBlockToWriter(r *Reader, w *Writer, from, to []byte) error { 407 iter, err := r.NewRawRangeKeyIter() 408 if err != nil { 409 return err 410 } 411 if iter == nil { 412 // No range keys. 413 return nil 414 } 415 defer iter.Close() 416 417 for s := iter.First(); s != nil; s = iter.Next() { 418 if !s.Valid() { 419 break 420 } 421 for i := range s.Keys { 422 if s.Keys[i].Kind() != base.InternalKeyKindRangeKeySet { 423 return errBadKind 424 } 425 if !bytes.Equal(s.Keys[i].Suffix, from) { 426 return errors.Errorf("key has suffix %q, expected %q", s.Keys[i].Suffix, from) 427 } 428 s.Keys[i].Suffix = to 429 } 430 431 err := rangekey.Encode(s, func(k base.InternalKey, v []byte) error { 432 // Calling AddRangeKey instead of addRangeKeySpan bypasses the fragmenter. 
433 // This is okay because the raw fragments off of `iter` are already 434 // fragmented, and suffix replacement should not affect fragmentation. 435 return w.AddRangeKey(k, v) 436 }) 437 if err != nil { 438 return err 439 } 440 } 441 442 return nil 443 } 444 445 type copyFilterWriter struct { 446 origMetaName string 447 origPolicyName string 448 data []byte 449 } 450 451 func (copyFilterWriter) addKey(key []byte) { panic("unimplemented") } 452 func (c copyFilterWriter) finish() ([]byte, error) { return c.data, nil } 453 func (c copyFilterWriter) metaName() string { return c.origMetaName } 454 func (c copyFilterWriter) policyName() string { return c.origPolicyName } 455 456 // RewriteKeySuffixesViaWriter is similar to RewriteKeySuffixes but uses just a 457 // single loop over the Reader that writes each key to the Writer with the new 458 // suffix. The is significantly slower than the parallelized rewriter, and does 459 // more work to rederive filters, props, etc. 460 // 461 // Any obsolete bits that key-value pairs may be annotated with are ignored 462 // and lost during the rewrite. Some of the obsolete bits may be recreated -- 463 // specifically when there are multiple keys with the same user key. 464 // Additionally, the output sstable has the pebble.obsolete.is_strict property 465 // set to false. See the longer comment at RewriteKeySuffixesAndReturnFormat. 
466 func RewriteKeySuffixesViaWriter( 467 r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte, 468 ) (*WriterMetadata, error) { 469 if o.Comparer == nil || o.Comparer.Split == nil { 470 return nil, errors.New("a valid splitter is required to rewrite suffixes") 471 } 472 473 o.IsStrictObsolete = false 474 w := NewWriter(out, o) 475 defer func() { 476 if w != nil { 477 w.Close() 478 } 479 }() 480 i, err := r.NewIter(nil, nil) 481 if err != nil { 482 return nil, err 483 } 484 defer i.Close() 485 486 k, v := i.First() 487 var scratch InternalKey 488 for k != nil { 489 if k.Kind() != InternalKeyKindSet { 490 return nil, errors.New("invalid key type") 491 } 492 oldSuffix := k.UserKey[r.Split(k.UserKey):] 493 if !bytes.Equal(oldSuffix, from) { 494 return nil, errors.Errorf("key has suffix %q, expected %q", oldSuffix, from) 495 } 496 scratch.UserKey = append(scratch.UserKey[:0], k.UserKey[:len(k.UserKey)-len(from)]...) 497 scratch.UserKey = append(scratch.UserKey, to...) 498 scratch.Trailer = k.Trailer 499 500 val, _, err := v.Value(nil) 501 if err != nil { 502 return nil, err 503 } 504 if w.addPoint(scratch, val, false); err != nil { 505 return nil, err 506 } 507 k, v = i.Next() 508 } 509 if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil { 510 return nil, err 511 } 512 if err := w.Close(); err != nil { 513 w = nil 514 return nil, err 515 } 516 writerMeta, err := w.Metadata() 517 w = nil 518 return writerMeta, err 519 } 520 521 // NewMemReader opens a reader over the SST stored in the passed []byte. 
522 func NewMemReader(sst []byte, o ReaderOptions) (*Reader, error) { 523 return NewReader(newMemReader(sst), o) 524 } 525 526 func readBlockBuf(r *Reader, bh BlockHandle, buf []byte) ([]byte, []byte, error) { 527 raw := r.readable.(*memReader).b[bh.Offset : bh.Offset+bh.Length+blockTrailerLen] 528 if err := checkChecksum(r.checksumType, raw, bh, 0); err != nil { 529 return nil, buf, err 530 } 531 typ := blockType(raw[bh.Length]) 532 raw = raw[:bh.Length] 533 if typ == noCompressionBlockType { 534 return raw, buf, nil 535 } 536 decompressedLen, prefix, err := decompressedLen(typ, raw) 537 if err != nil { 538 return nil, buf, err 539 } 540 if cap(buf) < decompressedLen { 541 buf = make([]byte, decompressedLen) 542 } 543 res, err := decompressInto(typ, raw[prefix:], buf[:decompressedLen]) 544 return res, buf, err 545 } 546 547 // memReader is a thin wrapper around a []byte such that it can be passed to 548 // sstable.Reader. It supports concurrent use, and does so without locking in 549 // contrast to the heavier read/write vfs.MemFile. 550 type memReader struct { 551 b []byte 552 r *bytes.Reader 553 rh objstorage.NoopReadHandle 554 } 555 556 var _ objstorage.Readable = (*memReader)(nil) 557 558 func newMemReader(b []byte) *memReader { 559 r := &memReader{ 560 b: b, 561 r: bytes.NewReader(b), 562 } 563 r.rh = objstorage.MakeNoopReadHandle(r) 564 return r 565 } 566 567 // ReadAt is part of objstorage.Readable. 568 func (m *memReader) ReadAt(_ context.Context, p []byte, off int64) error { 569 n, err := m.r.ReadAt(p, off) 570 if invariants.Enabled && err == nil && n != len(p) { 571 panic("short read") 572 } 573 return err 574 } 575 576 // Close is part of objstorage.Readable. 577 func (*memReader) Close() error { 578 return nil 579 } 580 581 // Stat is part of objstorage.Readable. 582 func (m *memReader) Size() int64 { 583 return int64(len(m.b)) 584 } 585 586 // NewReadHandle is part of objstorage.Readable. 
587 func (m *memReader) NewReadHandle(_ context.Context) objstorage.ReadHandle { 588 return &m.rh 589 }